In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [3]:
allticks_data = pd.read_csv("all_ticks_wide.csv")

allticks_data
Out[3]:
timestamp AEFES AKBNK AKSA AKSEN ALARK ALBRK ANACM ARCLK ASELS ... TTKOM TUKAS TUPRS USAK VAKBN VESTL YATAS YKBNK YUNSA ZOREN
0 2012-09-17T06:45:00Z 22.3978 5.2084 1.7102 3.87 1.4683 1.1356 1.0634 6.9909 2.9948 ... 4.2639 0.96 29.8072 1.0382 3.8620 1.90 0.4172 2.5438 2.2619 0.7789
1 2012-09-17T07:00:00Z 22.3978 5.1938 1.7066 3.86 1.4574 1.1275 1.0634 6.9259 2.9948 ... 4.2521 0.96 29.7393 1.0382 3.8529 1.90 0.4229 2.5266 2.2462 0.7789
2 2012-09-17T07:15:00Z 22.3978 5.2084 1.7102 NaN 1.4610 1.1356 1.0679 6.9909 2.9855 ... 4.2521 0.97 29.6716 1.0463 3.8436 1.91 0.4229 2.5266 2.2566 0.7789
3 2012-09-17T07:30:00Z 22.3978 5.1938 1.7102 3.86 1.4537 1.1275 1.0679 6.9584 2.9855 ... 4.2521 0.97 29.7393 1.0382 3.8529 1.91 0.4286 2.5324 2.2619 0.7860
4 2012-09-17T07:45:00Z 22.5649 5.2084 1.7102 3.87 1.4574 1.1356 1.0725 6.9909 2.9760 ... 4.2521 0.97 29.8072 1.0382 3.8620 1.90 0.4286 2.5324 2.2619 0.7789
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
50007 2019-07-23T14:00:00Z 20.4800 7.7300 9.1400 2.47 3.2300 1.2100 2.8400 20.3000 NaN ... 5.6000 4.34 131.6000 1.0500 4.8600 9.98 5.3500 2.7500 4.2500 NaN
50008 2019-07-23T14:15:00Z 20.5000 7.7200 9.1400 2.47 3.2200 1.2100 2.8400 20.3200 NaN ... 5.5700 4.35 131.5000 1.0500 4.8600 9.98 5.3400 2.7500 4.2400 NaN
50009 2019-07-23T14:30:00Z 20.5000 7.7400 9.1300 2.46 3.2300 1.2100 2.8300 20.3400 NaN ... 5.5700 4.36 131.5000 1.0500 4.8600 9.96 5.3400 2.7600 4.2400 NaN
50010 2019-07-23T14:45:00Z 20.4000 7.7000 9.1400 2.47 3.2400 1.2100 2.8200 20.3800 NaN ... 5.5700 4.35 131.3000 1.0400 4.8600 9.94 5.3400 2.7700 4.2400 NaN
50011 2019-07-23T15:00:00Z 20.4600 7.7000 9.1400 2.47 3.2300 1.2000 2.8300 20.3200 NaN ... 5.5600 4.34 131.8000 1.0500 4.8500 9.93 5.3300 2.7700 4.2400 NaN

50012 rows × 61 columns

df = allticks_data.drop('timestamp', axis = 1) df.mode(axis=0)

In [26]:
allticks_data.describe()
Out[26]:
AEFES AKBNK AKSA AKSEN ALARK ALBRK ANACM ARCLK ASELS ASUZU ... TTKOM TUKAS TUPRS USAK VAKBN VESTL YATAS YKBNK YUNSA ZOREN
count 48131.000000 49209.000000 48594.000000 48171.000000 48335.000000 46862.000000 48165.000000 49045.000000 48803.000000 48433.000000 ... 49077.000000 45929.000000 49143.000000 47659.000000 49212.000000 48781.000000 46055.000000 49225.000000 45528.000000 48807.000000
mean 20.982235 6.473105 7.127504 3.183542 2.060859 1.365549 1.672102 15.388088 13.432535 6.467033 ... 5.660680 1.737529 62.994535 1.220452 4.735438 5.942711 2.434249 2.566327 4.079695 1.248124
std 2.494002 0.944955 2.710033 0.724332 0.575943 0.167824 0.788365 4.531459 9.624246 2.201036 ... 0.818598 0.867095 32.398117 0.459532 0.977889 2.830465 2.552377 0.422774 1.347020 0.311330
min 0.000100 0.000100 0.000100 0.000000 0.000100 1.025500 0.000100 0.000100 0.000100 0.000100 ... 0.000100 0.650000 0.000100 0.000100 0.000100 0.000000 0.000100 0.000100 0.000100 0.000100
25% 19.160500 5.850000 5.208800 2.670000 1.568900 1.225100 1.047000 11.711100 4.940300 5.074800 ... 5.267300 1.060000 34.549100 0.957100 4.032200 4.020000 0.388600 2.268200 3.006700 1.033800
50% 20.646500 6.305700 6.985300 2.930000 1.937600 1.360200 1.259700 15.010000 9.275700 5.949600 ... 5.746400 1.530000 49.554200 1.050000 4.474200 6.320000 0.965800 2.609300 4.107800 1.250000
75% 22.732000 6.932500 8.720000 3.750000 2.421400 1.500000 2.402100 19.087700 22.756700 7.120000 ... 6.260000 2.130000 93.428700 1.370800 5.246000 7.450000 4.230000 2.874000 4.720600 1.426500
max 28.509000 9.212400 15.118900 5.190000 3.514300 2.190000 3.502100 26.427800 46.761600 15.280000 ... 7.350000 5.920000 139.293700 2.757800 7.581400 14.540000 10.674800 3.958100 9.527500 2.443000

8 rows × 60 columns

In [64]:
# Select all columns except the first column (timestamps)
columns_to_analyze = allticks_data.columns[1:]

# Calculate the variance, standard deviation, and coefficient of variation for each selected column
variances = allticks_data[columns_to_analyze].var()
std_deviations = allticks_data[columns_to_analyze].std()
means = allticks_data[columns_to_analyze].mean()
coeff_of_variation = (std_deviations / means) * 100  # Multiply by 100 to express as a percentage

# Create a new DataFrame to store the results
statistics_df = pd.DataFrame({
    'Column': columns_to_analyze,
    'Variance': variances,
    'Standard Deviation': std_deviations,
    'Coefficient of Variation': coeff_of_variation
})

# Display the calculated statistics
print(statistics_df)

# Save the statistics to an Excel file
statistics_df.to_excel('stats.xlsx', index=False)
      Column     Variance  Standard Deviation  Coefficient of Variation
AEFES  AEFES     6.220045            2.494002                 11.886255
AKBNK  AKBNK     0.892940            0.944955                 14.598175
AKSA    AKSA     7.344281            2.710033                 38.022191
AKSEN  AKSEN     0.524657            0.724332                 22.752397
ALARK  ALARK     0.331711            0.575943                 27.946760
ALBRK  ALBRK     0.028165            0.167824                 12.289822
ANACM  ANACM     0.621519            0.788365                 47.148149
ARCLK  ARCLK    20.534121            4.531459                 29.447837
ASELS  ASELS    92.626107            9.624246                 71.648765
ASUZU  ASUZU     4.844559            2.201036                 34.034707
AYGAZ  AYGAZ     6.814199            2.610402                 32.219438
BAGFS  BAGFS    13.090345            3.618058                 34.765198
BANVT  BANVT    39.278773            6.267278                 82.159003
BRISA  BRISA     1.677856            1.295321                 19.791314
CCOLA  CCOLA    45.524879            6.747213                 18.289735
CEMAS  CEMAS     0.639970            0.799981                 66.164017
ECILC  ECILC     0.948264            0.973788                 46.910015
EREGL  EREGL     7.240034            2.690731                 64.378583
FROTO  FROTO   217.051399           14.732664                 44.966434
GARAN  GARAN     1.561593            1.249637                 15.818726
GOODY  GOODY     0.785805            0.886456                 28.572461
GUBRF  GUBRF     1.495699            1.222988                 28.255460
HALKB  HALKB     9.434500            3.071563                 28.129534
ICBCT  ICBCT     3.203683            1.789883                 63.280257
ISCTR  ISCTR     1.006784            1.003386                 19.572350
ISDMR  ISDMR     2.882925            1.697918                 31.726922
ISFIN  ISFIN     3.114658            1.764839                113.172823
ISYAT  ISYAT     0.025679            0.160246                 29.822208
KAREL  KAREL     4.552330            2.133619                 67.136667
KARSN  KARSN     0.084340            0.290413                 21.886500
KCHOL  KCHOL    10.121585            3.181444                 25.974593
KRDMB  KRDMB     0.471124            0.686385                 30.879316
KRDMD  KRDMD     0.883774            0.940092                 53.160903
MGROS  MGROS    15.219903            3.901269                 19.928407
OTKAR  OTKAR   771.885345           27.782825                 34.123048
PARSN  PARSN    21.738631            4.662471                 56.330518
PETKM  PETKM     1.900291            1.378510                 54.288370
PGSUS  PGSUS    58.622534            7.656535                 30.886220
PRKME  PRKME     0.521210            0.721949                 24.664232
SAHOL  SAHOL     0.912617            0.955310                 11.087758
SASA    SASA     6.214720            2.492934                108.630461
SISE    SISE     2.024498            1.422848                 46.675752
SKBNK  SKBNK     0.086971            0.294908                 20.012042
SODA    SODA     4.185150            2.045764                 64.138753
TCELL  TCELL     5.551913            2.356250                 23.974859
THYAO  THYAO    16.219089            4.027293                 43.356341
TKFEN  TKFEN    44.449512            6.667047                 72.532480
TOASO  TOASO    40.046632            6.328241                 38.128192
TRKCM  TRKCM     1.209267            1.099667                 54.228227
TSKB    TSKB     0.024111            0.155276                 16.427277
TTKOM  TTKOM     0.670103            0.818598                 14.461129
TUKAS  TUKAS     0.751854            0.867095                 49.903932
TUPRS  TUPRS  1049.637996           32.398117                 51.430044
USAK    USAK     0.211170            0.459532                 37.652640
VAKBN  VAKBN     0.956266            0.977889                 20.650439
VESTL  VESTL     8.011531            2.830465                 47.629185
YATAS  YATAS     6.514629            2.552377                104.852755
YKBNK  YKBNK     0.178737            0.422774                 16.473878
YUNSA  YUNSA     1.814463            1.347020                 33.017664
ZOREN  ZOREN     0.096926            0.311330                 24.943838
In [38]:
# Assuming your DataFrame is named "allticks_data"

# Select all the columns containing stock prices (excluding the "timestamps" column)
stock_columns = allticks_data.columns[1:]  # Exclude the first column (timestamps)

# Create a box plot for each stock price column
for company in stock_columns:
    sns.set(style="whitegrid")  # Set the style of the plot (customize as needed)
    sns.boxplot(x=allticks_data[company])
    plt.xlabel("Price")
    plt.title(f"Box Plot of {company} Stock Price")
    plt.show()
In [42]:
# Select all the columns containing stock prices (excluding the "timestamps" column)
stock_columns = allticks_data.columns[1:]  # Exclude the first column (timestamps)

# Create a single combined box plot for all companies' stock prices
sns.set(style="whitegrid")  # Set the style of the plot (customize as needed)
sns.boxplot(data=allticks_data[stock_columns])
plt.xticks(rotation=90)  # Rotate x-axis labels for readability


plt.xlabel("Company")
plt.ylabel("Price")
plt.title("Combined Box Plot of Stock Prices for All Companies")
plt.show()

ALBRK, ASUZU, CEMAS, ICBCT, ISFIN, PARSN, TUKAS and some others have many outliers on the maximum side.

Outliers that are bigger than the maximum value of the whisker: The stock price increased or decreased significantly, basically it had smaller values most of the time but there are a few data that are significantly higher. There are many little values so that Q1 and Q3, thus, min and max whisker is calculated based on those values. Max value is calculated as a small value and all the bigger numbers are considered as an outlier.

ISDMR, TTKOM have many outliers on the minimum side. The opposite case of the maximum is valid in this case. However, those who have maximum side outliers are much more since the stock market is gaining popularity with time. The earlier values of all companies are small and similar in the beginning but they differ after some time.

In [65]:
# Select the columns containing stock prices (excluding the timestamp column)
stock_prices = allticks_data.iloc[:, 1:]

# Calculate the correlation matrix
correlation_matrix = stock_prices.corr()

# You now have a 60x60 correlation matrix
print(correlation_matrix)

correlation_matrix.to_excel('correlation_matrix.xlsx',index=False)
          AEFES     AKBNK      AKSA     AKSEN     ALARK     ALBRK     ANACM  \
AEFES  1.000000  0.267502 -0.000625  0.528379  0.319894  0.496981  0.117802   
AKBNK  0.267502  1.000000  0.572321  0.548420  0.657648  0.165933  0.439169   
AKSA  -0.000625  0.572321  1.000000  0.280772  0.636207 -0.045111  0.712541   
AKSEN  0.528379  0.548420  0.280772  1.000000  0.470681  0.283348  0.469720   
ALARK  0.319894  0.657648  0.636207  0.470681  1.000000  0.158375  0.736034   
ALBRK  0.496981  0.165933 -0.045111  0.283348  0.158375  1.000000  0.082215   
ANACM  0.117802  0.439169  0.712541  0.469720  0.736034  0.082215  1.000000   
ARCLK -0.335744  0.644439  0.735731  0.002287  0.464037 -0.243290  0.444435   
ASELS  0.032488  0.583853  0.844647  0.439348  0.752573 -0.052004  0.893538   
ASUZU  0.442354  0.587298  0.760313  0.594102  0.719034  0.318581  0.753061   
AYGAZ -0.039336  0.680857  0.899364  0.342756  0.704378 -0.087295  0.775918   
BAGFS  0.002867  0.030708 -0.273922  0.025920 -0.415153 -0.046602 -0.535003   
BANVT  0.161839  0.658539  0.776519  0.499313  0.865129 -0.046694  0.832482   
BRISA -0.227536  0.298082  0.478165 -0.141430 -0.058068  0.080278  0.051265   
CCOLA  0.479695 -0.083399 -0.418557 -0.079760 -0.129134  0.485198 -0.557608   
CEMAS  0.220689  0.480677  0.578001  0.670041  0.528057  0.020543  0.653769   
ECILC -0.028022  0.687260  0.893949  0.420829  0.728123 -0.124651  0.788193   
EREGL  0.022334  0.427050  0.830970  0.336232  0.657187 -0.010935  0.931368   
FROTO  0.014181  0.417780  0.817036  0.308924  0.654174  0.014792  0.927516   
GARAN  0.330771  0.936752  0.630530  0.524864  0.725833  0.265882  0.576837   
GOODY -0.148720  0.536148  0.567748  0.153148  0.282585 -0.034644  0.346986   
GUBRF -0.223603  0.133821  0.158594 -0.196855 -0.397135 -0.085523 -0.440147   
HALKB  0.434685  0.117774 -0.532093  0.227179 -0.151374  0.375976 -0.593122   
ICBCT  0.090316  0.331142  0.652518  0.312171  0.531813  0.020797  0.779935   
ISCTR  0.304757  0.909713  0.642620  0.519119  0.753016  0.212024  0.519361   
ISDMR  0.297087 -0.299530  0.347700  0.243702  0.307496  0.294577  0.860045   
ISFIN -0.127835  0.069228  0.389008 -0.072826  0.407486  0.166525  0.732345   
ISYAT -0.189416  0.285470  0.702517 -0.080503  0.587087 -0.065670  0.788362   
KAREL  0.181108  0.561646  0.790006  0.474214  0.813202  0.138252  0.934658   
KARSN  0.279880  0.495951  0.669682  0.426398  0.388063  0.367376  0.504093   
KCHOL -0.143505  0.623556  0.838016  0.207000  0.658166 -0.050933  0.799675   
KRDMB  0.549010  0.065148  0.276477  0.264686  0.451875  0.477526  0.252403   
KRDMD  0.292833  0.331925  0.685565  0.484434  0.518801  0.208807  0.799184   
MGROS  0.412668  0.766520  0.291698  0.600075  0.403851  0.201810  0.037760   
OTKAR -0.368310  0.577816  0.737121 -0.026571  0.453374 -0.213952  0.476567   
PARSN -0.075655  0.342925  0.736904  0.303896  0.467247 -0.094028  0.820097   
PETKM -0.016443  0.620385  0.865281  0.374754  0.727185 -0.057930  0.884786   
PGSUS  0.518629  0.102545  0.155788  0.216294  0.548264  0.476764  0.325378   
PRKME  0.624415  0.432895  0.006464  0.696566  0.499550  0.329030  0.162587   
SAHOL  0.421908  0.874744  0.365626  0.473875  0.492877  0.279617  0.139630   
SASA   0.064751  0.345486  0.720528  0.375461  0.650567  0.050585  0.963165   
SISE  -0.139481  0.392096  0.765385  0.177282  0.614721 -0.001287  0.911225   
SKBNK  0.529226 -0.026489 -0.339876  0.076806 -0.189774  0.535323 -0.559730   
SODA  -0.255682  0.249245  0.699090  0.068726  0.502520 -0.115282  0.859063   
TCELL  0.086501  0.452946  0.818100  0.089541  0.622756  0.175730  0.727762   
THYAO  0.243128  0.258007  0.673206  0.395772  0.550375  0.279413  0.851014   
TKFEN -0.018680  0.260238  0.555490  0.212043  0.633635  0.052638  0.927872   
TOASO -0.165691  0.673631  0.895798  0.256923  0.586738 -0.221665  0.622388   
TRKCM  0.091527  0.505256  0.852854  0.411484  0.732022  0.065874  0.927069   
TSKB   0.333290  0.856766  0.544898  0.590779  0.427739  0.182478  0.209076   
TTKOM  0.255756  0.578765  0.131398  0.211780  0.080230  0.235083 -0.306898   
TUKAS -0.204048  0.315863  0.561769 -0.077261  0.498251  0.024473  0.589974   
TUPRS -0.100480  0.384826  0.732025  0.235223  0.655195 -0.018337  0.928284   
USAK   0.083471  0.785438  0.728294  0.581016  0.652558 -0.075049  0.552056   
VAKBN  0.397543  0.942930  0.512615  0.606355  0.653074  0.304312  0.346859   
VESTL -0.150159  0.484747  0.776674  0.136128  0.509264 -0.016297  0.741402   
YATAS  0.235401  0.593489  0.821989  0.456702  0.791923  0.081669  0.885328   
YKBNK  0.538889  0.516761 -0.169104  0.375938  0.161212  0.434805 -0.314139   
YUNSA  0.475578  0.083837  0.394182  0.451502  0.374540  0.460184  0.565684   
ZOREN  0.061791  0.627474  0.701427  0.371093  0.391374  0.193623  0.515836   

          ARCLK     ASELS     ASUZU  ...     TTKOM     TUKAS     TUPRS  \
AEFES -0.335744  0.032488  0.442354  ...  0.255756 -0.204048 -0.100480   
AKBNK  0.644439  0.583853  0.587298  ...  0.578765  0.315863  0.384826   
AKSA   0.735731  0.844647  0.760313  ...  0.131398  0.561769  0.732025   
AKSEN  0.002287  0.439348  0.594102  ...  0.211780 -0.077261  0.235223   
ALARK  0.464037  0.752573  0.719034  ...  0.080230  0.498251  0.655195   
ALBRK -0.243290 -0.052004  0.318581  ...  0.235083  0.024473 -0.018337   
ANACM  0.444435  0.893538  0.753061  ... -0.306898  0.589974  0.928284   
ARCLK  1.000000  0.645008  0.332156  ...  0.238665  0.528223  0.570077   
ASELS  0.645008  1.000000  0.779175  ... -0.125976  0.528841  0.893495   
ASUZU  0.332156  0.779175  1.000000  ...  0.114170  0.377230  0.636567   
AYGAZ  0.832983  0.929203  0.692950  ...  0.051660  0.507003  0.813354   
BAGFS -0.145831 -0.403790 -0.308348  ...  0.453879 -0.262078 -0.512008   
BANVT  0.584277  0.936341  0.781870  ... -0.028089  0.490990  0.808954   
BRISA  0.490160  0.251358  0.289053  ...  0.378291  0.341308  0.204787   
CCOLA -0.403100 -0.564035 -0.198823  ...  0.404175 -0.462148 -0.635989   
CEMAS  0.305539  0.689888  0.656293  ... -0.037923  0.071991  0.494005   
ECILC  0.800039  0.939423  0.707779  ...  0.054373  0.480125  0.795279   
EREGL  0.562972  0.925961  0.774213  ... -0.253603  0.588243  0.929331   
FROTO  0.549083  0.909404  0.758415  ... -0.251237  0.640170  0.943490   
GARAN  0.585013  0.670082  0.721524  ...  0.446646  0.464070  0.535912   
GOODY  0.676047  0.559465  0.382466  ...  0.189484  0.267082  0.439865   
GUBRF  0.270226 -0.178222 -0.120396  ...  0.610556 -0.025786 -0.291051   
HALKB -0.425135 -0.559875 -0.276755  ...  0.533515 -0.528944 -0.694929   
ICBCT  0.394651  0.836779  0.666557  ... -0.294283  0.375011  0.808694   
ISCTR  0.600099  0.608127  0.665832  ...  0.514890  0.435206  0.463898   
ISDMR -0.567658  0.489981  0.530153  ... -0.450385  0.251758  0.735507   
ISFIN  0.322790  0.547116  0.367441  ... -0.463556  0.690668  0.802626   
ISYAT  0.609967  0.719683  0.525800  ... -0.297996  0.796818  0.875561   
KAREL  0.508408  0.894148  0.818012  ... -0.109278  0.660567  0.882167   
KARSN  0.267356  0.565763  0.815915  ...  0.289611  0.385659  0.455140   
KCHOL  0.830299  0.878205  0.636624  ... -0.012342  0.722824  0.886318   
KRDMB -0.175953  0.201025  0.535803  ...  0.055775 -0.052234  0.105183   
KRDMD  0.250640  0.763269  0.837706  ... -0.187339  0.319795  0.717513   
MGROS  0.282331  0.254276  0.391553  ...  0.739648 -0.084991 -0.055947   
OTKAR  0.917773  0.636471  0.376711  ...  0.195679  0.584642  0.586448   
PARSN  0.514000  0.819982  0.628660  ... -0.315603  0.438614  0.820912   
PETKM  0.732698  0.963461  0.745418  ... -0.087614  0.596628  0.891300   
PGSUS -0.309820  0.170161  0.478515  ... -0.027542  0.276271  0.211858   
PRKME -0.208820  0.181213  0.355657  ...  0.323079 -0.120132 -0.018191   
SAHOL  0.461832  0.278019  0.425178  ...  0.762092  0.096913  0.059683   
SASA   0.426195  0.877541  0.734678  ... -0.368041  0.588859  0.935314   
SISE   0.648147  0.879780  0.631260  ... -0.305630  0.702447  0.962189   
SKBNK -0.485077 -0.482530 -0.032653  ...  0.498479 -0.440434 -0.640367   
SODA   0.600288  0.813426  0.511613  ... -0.422378  0.694059  0.950182   
TCELL  0.588828  0.783621  0.740609  ... -0.013502  0.653890  0.780493   
THYAO  0.208139  0.766054  0.818347  ... -0.276484  0.469554  0.792173   
TKFEN  0.376929  0.778524  0.586514  ... -0.471644  0.687804  0.936246   
TOASO  0.894959  0.838014  0.604371  ...  0.189400  0.482214  0.684919   
TRKCM  0.583270  0.941957  0.825733  ... -0.163798  0.536308  0.892095   
TSKB   0.500843  0.429668  0.563091  ...  0.737490  0.083406  0.131547   
TTKOM  0.238665 -0.125976  0.114170  ...  1.000000 -0.047513 -0.337725   
TUKAS  0.528223  0.528841  0.377230  ... -0.047513  1.000000  0.731186   
TUPRS  0.570077  0.893495  0.636567  ... -0.337725  0.731186  1.000000   
USAK   0.651974  0.757735  0.613202  ...  0.339795  0.316385  0.531317   
VAKBN  0.524931  0.489668  0.598292  ...  0.647927  0.200089  0.255812   
VESTL  0.673333  0.747866  0.660330  ... -0.039156  0.716806  0.793732   
YATAS  0.526898  0.931239  0.849296  ... -0.053002  0.539603  0.838108   
YKBNK -0.119385 -0.259706  0.105741  ...  0.752652 -0.242727 -0.442756   
YUNSA -0.186253  0.420727  0.696278  ... -0.167863  0.126451  0.416737   
ZOREN  0.545555  0.635820  0.720764  ...  0.326075  0.377858  0.500383   

           USAK     VAKBN     VESTL     YATAS     YKBNK     YUNSA     ZOREN  
AEFES  0.083471  0.397543 -0.150159  0.235401  0.538889  0.475578  0.061791  
AKBNK  0.785438  0.942930  0.484747  0.593489  0.516761  0.083837  0.627474  
AKSA   0.728294  0.512615  0.776674  0.821989 -0.169104  0.394182  0.701427  
AKSEN  0.581016  0.606355  0.136128  0.456702  0.375938  0.451502  0.371093  
ALARK  0.652558  0.653074  0.509264  0.791923  0.161212  0.374540  0.391374  
ALBRK -0.075049  0.304312 -0.016297  0.081669  0.434805  0.460184  0.193623  
ANACM  0.552056  0.346859  0.741402  0.885328 -0.314139  0.565684  0.515836  
ARCLK  0.651974  0.524931  0.673333  0.526898 -0.119385 -0.186253  0.545555  
ASELS  0.757735  0.489668  0.747866  0.931239 -0.259706  0.420727  0.635820  
ASUZU  0.613202  0.598292  0.660330  0.849296  0.105741  0.696278  0.720764  
AYGAZ  0.809034  0.592549  0.758866  0.836522 -0.169948  0.257124  0.659939  
BAGFS -0.006847  0.070047 -0.271655 -0.421186  0.405499 -0.373769 -0.007022  
BANVT  0.798737  0.587479  0.669037  0.898647 -0.075599  0.371989  0.566044  
BRISA  0.264396  0.240770  0.537781  0.154711  0.018329  0.094437  0.666418  
CCOLA -0.342572  0.106403 -0.556101 -0.478281  0.618194  0.048334 -0.297058  
CEMAS  0.631888  0.451906  0.412297  0.654709 -0.030140  0.472474  0.479668  
ECILC  0.845493  0.604237  0.749101  0.853425 -0.153317  0.279614  0.662133  
EREGL  0.579126  0.324807  0.843051  0.891983 -0.363152  0.582277  0.623334  
FROTO  0.531192  0.301170  0.821596  0.886083 -0.369152  0.565362  0.615279  
GARAN  0.735682  0.889723  0.591568  0.716645  0.416988  0.274365  0.684231  
GOODY  0.553901  0.449401  0.590861  0.428813 -0.003119  0.012931  0.626049  
GUBRF  0.119931  0.123224  0.021861 -0.283812  0.181517 -0.308403  0.273674  
HALKB -0.085364  0.289094 -0.663312 -0.526251  0.796584 -0.182357 -0.316223  
ICBCT  0.506300  0.234724  0.587006  0.765115 -0.362153  0.540416  0.503495  
ISCTR  0.785017  0.915960  0.538922  0.644241  0.465772  0.248998  0.585588  
ISDMR -0.172702 -0.284470  0.470635  0.521031 -0.307184  0.747014  0.105374  
ISFIN  0.080190 -0.015633  0.638451  0.576695 -0.497536  0.347318  0.262856  
ISYAT  0.304051  0.142741  0.806164  0.707975 -0.450457  0.337598  0.455052  
KAREL  0.647139  0.502028  0.779622  0.933984 -0.146574  0.530879  0.594751  
KARSN  0.488105  0.493382  0.615618  0.641713  0.134530  0.569537  0.781788  
KCHOL  0.659195  0.484400  0.866697  0.794275 -0.234732  0.233001  0.669182  
KRDMB  0.089792  0.224270  0.052380  0.378105  0.197284  0.706336  0.160795  
KRDMD  0.445333  0.300068  0.688775  0.790928 -0.175728  0.809497  0.630392  
MGROS  0.700536  0.839945  0.056260  0.236725  0.722851  0.053126  0.403012  
OTKAR  0.550614  0.452672  0.720430  0.531581 -0.169528 -0.114003  0.582256  
PARSN  0.494088  0.197149  0.735754  0.749385 -0.428178  0.458106  0.544246  
PETKM  0.724798  0.503126  0.810474  0.915139 -0.263232  0.345646  0.680342  
PGSUS  0.019848  0.170019  0.093652  0.366702  0.241310  0.590951  0.093774  
PRKME  0.424865  0.546548 -0.186046  0.283737  0.565032  0.247195  0.057171  
SAHOL  0.566766  0.897199  0.220474  0.320306  0.734385  0.036501  0.461139  
SASA   0.501127  0.249046  0.749320  0.863806 -0.404375  0.606613  0.511793  
SISE   0.499419  0.263305  0.841056  0.807107 -0.453436  0.420605  0.574375  
SKBNK -0.220973  0.168601 -0.505531 -0.347009  0.653931  0.065890 -0.084631  
SODA   0.415936  0.103986  0.799403  0.709057 -0.580021  0.365282  0.463037  
TCELL  0.454743  0.383525  0.753048  0.797806 -0.207289  0.512561  0.648446  
THYAO  0.360347  0.210835  0.643835  0.801843 -0.305493  0.777343  0.574916  
TKFEN  0.320969  0.131223  0.719123  0.767302 -0.447145  0.486685  0.384886  
TOASO  0.791129  0.573100  0.759496  0.744917 -0.134525  0.093819  0.686027  
TRKCM  0.634915  0.433015  0.812177  0.911697 -0.267388  0.604393  0.667266  
TSKB   0.731866  0.880894  0.347174  0.453043  0.574964  0.090291  0.668327  
TTKOM  0.339795  0.647927 -0.039156 -0.053002  0.752652 -0.167863  0.326075  
TUKAS  0.316385  0.200089  0.716806  0.539603 -0.242727  0.126451  0.377858  
TUPRS  0.531317  0.255812  0.793732  0.838108 -0.442756  0.416737  0.500383  
USAK   1.000000  0.755431  0.493929  0.661793  0.175051  0.143049  0.564730  
VAKBN  0.755431  1.000000  0.357053  0.532766  0.634247  0.140449  0.564502  
VESTL  0.493929  0.357053  1.000000  0.750194 -0.223108  0.355600  0.759132  
YATAS  0.661793  0.532766  0.750194  1.000000 -0.132810  0.518503  0.645384  
YKBNK  0.175051  0.634247 -0.223108 -0.132810  1.000000 -0.017644  0.110198  
YUNSA  0.143049  0.140449  0.355600  0.518503 -0.017644  1.000000  0.378837  
ZOREN  0.564730  0.564502  0.759132  0.645384  0.110198  0.378837  1.000000  

[60 rows x 60 columns]
---------------------------------------------------------------------------
PermissionError                           Traceback (most recent call last)
Cell In[65], line 10
      7 # You now have a 60x60 correlation matrix
      8 print(correlation_matrix)
---> 10 correlation_matrix.to_excel('correlation_matrix.xlsx',index=False)

File ~\anaconda3\lib\site-packages\pandas\util\_decorators.py:211, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs)
    209     else:
    210         kwargs[new_arg_name] = new_arg_value
--> 211 return func(*args, **kwargs)

File ~\anaconda3\lib\site-packages\pandas\util\_decorators.py:211, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs)
    209     else:
    210         kwargs[new_arg_name] = new_arg_value
--> 211 return func(*args, **kwargs)

File ~\anaconda3\lib\site-packages\pandas\core\generic.py:2374, in NDFrame.to_excel(self, excel_writer, sheet_name, na_rep, float_format, columns, header, index, index_label, startrow, startcol, engine, merge_cells, encoding, inf_rep, verbose, freeze_panes, storage_options)
   2361 from pandas.io.formats.excel import ExcelFormatter
   2363 formatter = ExcelFormatter(
   2364     df,
   2365     na_rep=na_rep,
   (...)
   2372     inf_rep=inf_rep,
   2373 )
-> 2374 formatter.write(
   2375     excel_writer,
   2376     sheet_name=sheet_name,
   2377     startrow=startrow,
   2378     startcol=startcol,
   2379     freeze_panes=freeze_panes,
   2380     engine=engine,
   2381     storage_options=storage_options,
   2382 )

File ~\anaconda3\lib\site-packages\pandas\io\formats\excel.py:944, in ExcelFormatter.write(self, writer, sheet_name, startrow, startcol, freeze_panes, engine, storage_options)
    940     need_save = False
    941 else:
    942     # error: Cannot instantiate abstract class 'ExcelWriter' with abstract
    943     # attributes 'engine', 'save', 'supported_extensions' and 'write_cells'
--> 944     writer = ExcelWriter(  # type: ignore[abstract]
    945         writer, engine=engine, storage_options=storage_options
    946     )
    947     need_save = True
    949 try:

File ~\anaconda3\lib\site-packages\pandas\io\excel\_openpyxl.py:60, in OpenpyxlWriter.__init__(self, path, engine, date_format, datetime_format, mode, storage_options, if_sheet_exists, engine_kwargs, **kwargs)
     56 from openpyxl.workbook import Workbook
     58 engine_kwargs = combine_kwargs(engine_kwargs, kwargs)
---> 60 super().__init__(
     61     path,
     62     mode=mode,
     63     storage_options=storage_options,
     64     if_sheet_exists=if_sheet_exists,
     65     engine_kwargs=engine_kwargs,
     66 )
     68 # ExcelWriter replaced "a" by "r+" to allow us to first read the excel file from
     69 # the file and later write to it
     70 if "r+" in self._mode:  # Load from existing workbook

File ~\anaconda3\lib\site-packages\pandas\io\excel\_base.py:1313, in ExcelWriter.__init__(self, path, engine, date_format, datetime_format, mode, storage_options, if_sheet_exists, engine_kwargs, **kwargs)
   1309 self._handles = IOHandles(
   1310     cast(IO[bytes], path), compression={"compression": None}
   1311 )
   1312 if not isinstance(path, ExcelWriter):
-> 1313     self._handles = get_handle(
   1314         path, mode, storage_options=storage_options, is_text=False
   1315     )
   1316 self._cur_sheet = None
   1318 if date_format is None:

File ~\anaconda3\lib\site-packages\pandas\io\common.py:865, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    856         handle = open(
    857             handle,
    858             ioargs.mode,
   (...)
    861             newline="",
    862         )
    863     else:
    864         # Binary mode
--> 865         handle = open(handle, ioargs.mode)
    866     handles.append(handle)
    868 # Convert BytesIO or file objects passed with an encoding

PermissionError: [Errno 13] Permission denied: 'correlation_matrix.xlsx'
In [67]:
#highest correlations


# Flatten the upper triangular part of the correlation matrix, excluding the diagonal
upper_triangle = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))

# Sort the values in descending order
sorted_values = upper_triangle.unstack().sort_values(ascending=False)

# Select the top values
top_values = sorted_values[0:60]

# Get the corresponding row and column names
top_corr_pairs = top_values.index
top_corr_values = top_values.values

# Print the results
for pair, value in zip(top_corr_pairs, top_corr_values):
    row, col = pair
    print(f"Correlation between {row} and {col}: {value:.4f}")
Correlation between TRKCM and EREGL: 0.9740
Correlation between FROTO and EREGL: 0.9725
Correlation between ECILC and AYGAZ: 0.9711
Correlation between SODA and SISE: 0.9702
Correlation between PETKM and ASELS: 0.9635
Correlation between SASA and ANACM: 0.9632
Correlation between TUPRS and SISE: 0.9622
Correlation between SASA and FROTO: 0.9560
Correlation between SASA and EREGL: 0.9542
Correlation between TUPRS and SODA: 0.9502
Correlation between PETKM and AYGAZ: 0.9472
Correlation between TRKCM and FROTO: 0.9462
Correlation between TUPRS and FROTO: 0.9435
Correlation between VAKBN and AKBNK: 0.9429
Correlation between TRKCM and ASELS: 0.9420
Correlation between PETKM and ECILC: 0.9418
Correlation between ECILC and ASELS: 0.9394
Correlation between TOASO and AYGAZ: 0.9374
Correlation between SISE and FROTO: 0.9371
Correlation between GARAN and AKBNK: 0.9368
Correlation between BANVT and ASELS: 0.9363
Correlation between TUPRS and TKFEN: 0.9362
Correlation between TUPRS and SASA: 0.9353
Correlation between TOASO and ECILC: 0.9351
Correlation between KAREL and ANACM: 0.9347
Correlation between SISE and EREGL: 0.9343
Correlation between YATAS and KAREL: 0.9340
Correlation between ISCTR and GARAN: 0.9322
Correlation between SASA and ISDMR: 0.9321
Correlation between EREGL and ANACM: 0.9314
Correlation between TKFEN and SASA: 0.9313
Correlation between YATAS and ASELS: 0.9312
Correlation between TUPRS and EREGL: 0.9293
Correlation between TRKCM and PETKM: 0.9292
Correlation between AYGAZ and ASELS: 0.9292
Correlation between TUPRS and ANACM: 0.9283
Correlation between TKFEN and ANACM: 0.9279
Correlation between FROTO and ANACM: 0.9275
Correlation between TRKCM and ANACM: 0.9271
Correlation between EREGL and ASELS: 0.9260
Correlation between TRKCM and SASA: 0.9253
Correlation between PETKM and KCHOL: 0.9239
Correlation between KCHOL and AYGAZ: 0.9233
Correlation between TRKCM and SISE: 0.9191
Correlation between OTKAR and ARCLK: 0.9178
Correlation between SISE and KCHOL: 0.9163
Correlation between VAKBN and ISCTR: 0.9160
Correlation between YATAS and PETKM: 0.9151
Correlation between PETKM and EREGL: 0.9141
Correlation between SISE and SASA: 0.9134
Correlation between YATAS and TRKCM: 0.9117
Correlation between SISE and ANACM: 0.9112
Correlation between ISCTR and AKBNK: 0.9097
Correlation between TRKCM and KAREL: 0.9096
Correlation between TKFEN and SISE: 0.9095
Correlation between FROTO and ASELS: 0.9094
Correlation between PETKM and FROTO: 0.9089
Correlation between ISDMR and FROTO: 0.9070
Correlation between SISE and PETKM: 0.9068
Correlation between SASA and KAREL: 0.9046
In [68]:
#I selected these two because they had the highest coefficients of variations

# Select the specified columns
selected_columns = ["ISFIN", "SASA"]

# Initialize a list to store correlation values and corresponding timestamps
correlations_with_timestamps = []

# Calculate the correlation and create and display separate plots for each set of 750 rows
num_sets = len(allticks_data) // 750

for i in range(num_sets):
    start_row = i * 750
    end_row = (i + 1) * 750
    period_data = allticks_data[selected_columns][start_row:end_row]
    
    # Drop rows with missing values
    period_data.dropna(inplace=True)
    
    # Extract the data for each column
    x = period_data["ISFIN"]
    y = period_data["SASA"]
    
    # Calculate the correlation coefficient and correlation
    correlation_coefficient = np.corrcoef(x, y)[0, 1]
    correlation = np.corrcoef(x, y)[0, 1]  # Calculate the correlation itself
    
    # Get the start and end timestamps
    start_timestamp = allticks_data["timestamp"].iloc[start_row]
    end_timestamp = allticks_data["timestamp"].iloc[end_row - 1]
    
    # Create a scatter plot for the period
    plt.figure(figsize=(8, 6))
    plt.scatter(x, y, label="Data Points", color="blue")
    
    # Set labels and title
    plt.xlabel("ISFIN")
    plt.ylabel("SASA")
    plt.title(f"Scatter Plot for Rows {start_row+1}-{end_row}")
    
    # Add text to display the correlation coefficient and correlation
    plt.text(0.2, 0.85, f"Correlation Coefficient: {correlation_coefficient:.2f}", transform=plt.gca().transAxes, fontsize=12)
    plt.text(0.2, 0.78, f"Correlation: {correlation:.2f}", transform=plt.gca().transAxes, fontsize=12)
    
    # Display the plot
    plt.legend(loc="upper right")
    plt.show()
    
    # Determine the correlation sign based on the correlation coefficient
    correlation_sign = "positive" if correlation_coefficient > 0 else "negative"
    
    # Add the correlation and timestamps to the list
    correlations_with_timestamps.append({
        "Correlation": correlation,
        "Correlation Sign": correlation_sign,
        "Start Timestamp": start_timestamp,
        "End Timestamp": end_timestamp,
        "Start Row": start_row,
        "End Row": end_row
    })

# Display the list of correlation values and timestamps
for item in correlations_with_timestamps:
    print(f"Correlation: {item['Correlation']:.2f}, Sign: {item['Correlation Sign']}, Start Timestamp: {item['Start Timestamp']}, End Timestamp: {item['End Timestamp']}, Start Row: {item['Start Row']}, End Row: {item['End Row']}")
Correlation: -0.12, Sign: negative, Start Timestamp: 2012-09-17T06:45:00Z, End Timestamp: 2012-10-31T10:00:00Z, Start Row: 0, End Row: 750
Correlation: 0.71, Sign: positive, Start Timestamp: 2012-10-31T10:15:00Z, End Timestamp: 2012-12-11T09:00:00Z, Start Row: 750, End Row: 1500
Correlation: 0.56, Sign: positive, Start Timestamp: 2012-12-11T09:15:00Z, End Timestamp: 2013-01-22T08:45:00Z, Start Row: 1500, End Row: 2250
Correlation: 0.80, Sign: positive, Start Timestamp: 2013-01-22T09:00:00Z, End Timestamp: 2013-03-04T07:45:00Z, Start Row: 2250, End Row: 3000
Correlation: -0.15, Sign: negative, Start Timestamp: 2013-03-04T08:00:00Z, End Timestamp: 2013-04-11T13:15:00Z, Start Row: 3000, End Row: 3750
Correlation: -0.16, Sign: negative, Start Timestamp: 2013-04-11T13:30:00Z, End Timestamp: 2013-05-23T11:45:00Z, Start Row: 3750, End Row: 4500
Correlation: 0.87, Sign: positive, Start Timestamp: 2013-05-23T12:00:00Z, End Timestamp: 2013-07-03T08:45:00Z, Start Row: 4500, End Row: 5250
Correlation: -0.04, Sign: negative, Start Timestamp: 2013-07-03T09:00:00Z, End Timestamp: 2013-08-16T08:00:00Z, Start Row: 5250, End Row: 6000
Correlation: 0.94, Sign: positive, Start Timestamp: 2013-08-16T08:15:00Z, End Timestamp: 2013-09-26T12:15:00Z, Start Row: 6000, End Row: 6750
Correlation: 0.94, Sign: positive, Start Timestamp: 2013-09-26T12:30:00Z, End Timestamp: 2013-11-13T13:00:00Z, Start Row: 6750, End Row: 7500
Correlation: 0.37, Sign: positive, Start Timestamp: 2013-11-13T13:15:00Z, End Timestamp: 2013-12-23T10:15:00Z, Start Row: 7500, End Row: 8250
Correlation: 0.10, Sign: positive, Start Timestamp: 2013-12-23T12:00:00Z, End Timestamp: 2014-01-31T10:00:00Z, Start Row: 8250, End Row: 9000
Correlation: 0.14, Sign: positive, Start Timestamp: 2014-01-31T10:15:00Z, End Timestamp: 2014-03-12T08:45:00Z, Start Row: 9000, End Row: 9750
Correlation: 0.85, Sign: positive, Start Timestamp: 2014-03-12T09:00:00Z, End Timestamp: 2014-04-21T12:15:00Z, Start Row: 9750, End Row: 10500
Correlation: 0.44, Sign: positive, Start Timestamp: 2014-04-21T12:30:00Z, End Timestamp: 2014-06-04T12:45:00Z, Start Row: 10500, End Row: 11250
Correlation: 0.19, Sign: positive, Start Timestamp: 2014-06-04T13:00:00Z, End Timestamp: 2014-07-14T08:00:00Z, Start Row: 11250, End Row: 12000
Correlation: 0.36, Sign: positive, Start Timestamp: 2014-07-14T08:15:00Z, End Timestamp: 2014-08-25T12:15:00Z, Start Row: 12000, End Row: 12750
Correlation: -0.75, Sign: negative, Start Timestamp: 2014-08-25T12:30:00Z, End Timestamp: 2014-10-01T13:00:00Z, Start Row: 12750, End Row: 13500
Correlation: 0.21, Sign: positive, Start Timestamp: 2014-10-01T13:15:00Z, End Timestamp: 2014-11-14T08:15:00Z, Start Row: 13500, End Row: 14250
Correlation: 0.56, Sign: positive, Start Timestamp: 2014-11-14T08:30:00Z, End Timestamp: 2014-12-23T08:30:00Z, Start Row: 14250, End Row: 15000
Correlation: 0.79, Sign: positive, Start Timestamp: 2014-12-23T08:45:00Z, End Timestamp: 2015-01-30T10:00:00Z, Start Row: 15000, End Row: 15750
Correlation: -0.02, Sign: negative, Start Timestamp: 2015-01-30T10:15:00Z, End Timestamp: 2015-03-10T13:30:00Z, Start Row: 15750, End Row: 16500
Correlation: 0.60, Sign: positive, Start Timestamp: 2015-03-10T13:45:00Z, End Timestamp: 2015-04-16T12:15:00Z, Start Row: 16500, End Row: 17250
Correlation: 0.21, Sign: positive, Start Timestamp: 2015-04-16T12:30:00Z, End Timestamp: 2015-05-28T13:15:00Z, Start Row: 17250, End Row: 18000
Correlation: -0.64, Sign: negative, Start Timestamp: 2015-05-28T13:30:00Z, End Timestamp: 2015-07-07T06:30:00Z, Start Row: 18000, End Row: 18750
Correlation: 0.49, Sign: positive, Start Timestamp: 2015-07-07T06:45:00Z, End Timestamp: 2015-08-14T11:30:00Z, Start Row: 18750, End Row: 19500
Correlation: -0.27, Sign: negative, Start Timestamp: 2015-08-14T11:45:00Z, End Timestamp: 2015-09-22T13:00:00Z, Start Row: 19500, End Row: 20250
Correlation: -0.24, Sign: negative, Start Timestamp: 2015-09-22T13:15:00Z, End Timestamp: 2015-11-09T09:30:00Z, Start Row: 20250, End Row: 21000
Correlation: 0.91, Sign: positive, Start Timestamp: 2015-11-09T09:45:00Z, End Timestamp: 2015-12-15T09:30:00Z, Start Row: 21000, End Row: 21750
Correlation: 0.61, Sign: positive, Start Timestamp: 2015-12-15T09:45:00Z, End Timestamp: 2016-01-20T08:15:00Z, Start Row: 21750, End Row: 22500
Correlation: 0.50, Sign: positive, Start Timestamp: 2016-01-20T08:30:00Z, End Timestamp: 2016-02-23T14:45:00Z, Start Row: 22500, End Row: 23250
Correlation: 0.89, Sign: positive, Start Timestamp: 2016-02-23T15:00:00Z, End Timestamp: 2016-03-29T10:45:00Z, Start Row: 23250, End Row: 24000
Correlation: 0.89, Sign: positive, Start Timestamp: 2016-03-29T11:00:00Z, End Timestamp: 2016-05-03T07:00:00Z, Start Row: 24000, End Row: 24750
Correlation: 0.82, Sign: positive, Start Timestamp: 2016-05-03T07:15:00Z, End Timestamp: 2016-06-07T13:30:00Z, Start Row: 24750, End Row: 25500
Correlation: 0.42, Sign: positive, Start Timestamp: 2016-06-07T13:45:00Z, End Timestamp: 2016-07-18T08:00:00Z, Start Row: 25500, End Row: 26250
Correlation: -0.54, Sign: negative, Start Timestamp: 2016-07-18T08:15:00Z, End Timestamp: 2016-08-19T14:30:00Z, Start Row: 26250, End Row: 27000
Correlation: 0.54, Sign: positive, Start Timestamp: 2016-08-22T06:30:00Z, End Timestamp: 2016-09-30T13:45:00Z, Start Row: 27000, End Row: 27750
Correlation: -0.36, Sign: negative, Start Timestamp: 2016-09-30T14:00:00Z, End Timestamp: 2016-11-07T07:45:00Z, Start Row: 27750, End Row: 28500
Correlation: 0.19, Sign: positive, Start Timestamp: 2016-11-07T08:00:00Z, End Timestamp: 2016-12-09T08:45:00Z, Start Row: 28500, End Row: 29250
Correlation: 0.15, Sign: positive, Start Timestamp: 2016-12-09T09:00:00Z, End Timestamp: 2017-01-12T08:15:00Z, Start Row: 29250, End Row: 30000
Correlation: -0.13, Sign: negative, Start Timestamp: 2017-01-12T08:30:00Z, End Timestamp: 2017-02-15T07:30:00Z, Start Row: 30000, End Row: 30750
Correlation: 0.28, Sign: positive, Start Timestamp: 2017-02-15T07:45:00Z, End Timestamp: 2017-03-21T07:30:00Z, Start Row: 30750, End Row: 31500
Correlation: 0.38, Sign: positive, Start Timestamp: 2017-03-21T07:45:00Z, End Timestamp: 2017-04-21T14:15:00Z, Start Row: 31500, End Row: 32250
Correlation: 0.04, Sign: positive, Start Timestamp: 2017-04-21T14:30:00Z, End Timestamp: 2017-05-29T12:30:00Z, Start Row: 32250, End Row: 33000
Correlation: 0.28, Sign: positive, Start Timestamp: 2017-05-29T12:45:00Z, End Timestamp: 2017-07-04T11:15:00Z, Start Row: 33000, End Row: 33750
Correlation: 0.88, Sign: positive, Start Timestamp: 2017-07-04T11:30:00Z, End Timestamp: 2017-08-07T08:30:00Z, Start Row: 33750, End Row: 34500
Correlation: -0.21, Sign: negative, Start Timestamp: 2017-08-07T08:45:00Z, End Timestamp: 2017-09-13T12:45:00Z, Start Row: 34500, End Row: 35250
Correlation: -0.13, Sign: negative, Start Timestamp: 2017-09-13T13:00:00Z, End Timestamp: 2017-10-17T12:00:00Z, Start Row: 35250, End Row: 36000
Correlation: -0.16, Sign: negative, Start Timestamp: 2017-10-17T12:15:00Z, End Timestamp: 2017-11-20T09:45:00Z, Start Row: 36000, End Row: 36750
Correlation: 0.64, Sign: positive, Start Timestamp: 2017-11-20T10:45:00Z, End Timestamp: 2017-12-22T08:00:00Z, Start Row: 36750, End Row: 37500
Correlation: 0.07, Sign: positive, Start Timestamp: 2017-12-22T08:15:00Z, End Timestamp: 2018-01-25T13:30:00Z, Start Row: 37500, End Row: 38250
Correlation: -0.39, Sign: negative, Start Timestamp: 2018-01-25T13:45:00Z, End Timestamp: 2018-02-28T12:30:00Z, Start Row: 38250, End Row: 39000
Correlation: 0.75, Sign: positive, Start Timestamp: 2018-02-28T12:45:00Z, End Timestamp: 2018-04-03T12:00:00Z, Start Row: 39000, End Row: 39750
Correlation: 0.10, Sign: positive, Start Timestamp: 2018-04-03T12:15:00Z, End Timestamp: 2018-05-09T09:15:00Z, Start Row: 39750, End Row: 40500
Correlation: -0.41, Sign: negative, Start Timestamp: 2018-05-09T09:30:00Z, End Timestamp: 2018-06-12T07:45:00Z, Start Row: 40500, End Row: 41250
Correlation: 0.16, Sign: positive, Start Timestamp: 2018-06-12T08:00:00Z, End Timestamp: 2018-07-17T10:45:00Z, Start Row: 41250, End Row: 42000
Correlation: 0.32, Sign: positive, Start Timestamp: 2018-07-17T11:00:00Z, End Timestamp: 2018-08-17T14:15:00Z, Start Row: 42000, End Row: 42750
Correlation: -0.30, Sign: negative, Start Timestamp: 2018-08-17T14:30:00Z, End Timestamp: 2018-09-28T09:30:00Z, Start Row: 42750, End Row: 43500
Correlation: 0.32, Sign: positive, Start Timestamp: 2018-09-28T09:45:00Z, End Timestamp: 2018-11-01T15:00:00Z, Start Row: 43500, End Row: 44250
Correlation: 0.49, Sign: positive, Start Timestamp: 2018-11-02T06:45:00Z, End Timestamp: 2018-12-05T11:30:00Z, Start Row: 44250, End Row: 45000
Correlation: 0.55, Sign: positive, Start Timestamp: 2018-12-05T11:45:00Z, End Timestamp: 2019-01-09T07:00:00Z, Start Row: 45000, End Row: 45750
Correlation: 0.58, Sign: positive, Start Timestamp: 2019-01-09T07:15:00Z, End Timestamp: 2019-02-11T11:15:00Z, Start Row: 45750, End Row: 46500
Correlation: 0.28, Sign: positive, Start Timestamp: 2019-02-11T11:30:00Z, End Timestamp: 2019-03-14T14:45:00Z, Start Row: 46500, End Row: 47250
Correlation: -0.11, Sign: negative, Start Timestamp: 2019-03-14T15:00:00Z, End Timestamp: 2019-04-17T10:00:00Z, Start Row: 47250, End Row: 48000
Correlation: 0.92, Sign: positive, Start Timestamp: 2019-04-17T10:45:00Z, End Timestamp: 2019-05-22T14:15:00Z, Start Row: 48000, End Row: 48750
Correlation: -0.69, Sign: negative, Start Timestamp: 2019-05-22T14:30:00Z, End Timestamp: 2019-06-28T15:00:00Z, Start Row: 48750, End Row: 49500
In [49]:
#I wanted to see those periods with negative correlation to see which one is increasing and decreasing.

# Define the row periods for negative correlations
row_periods = [(item["Start Row"], item["End Row"]) for item in correlations_with_timestamps if item["Correlation Sign"] == "negative"]

# Iterate through the row periods and create separate plots
for start_row, end_row in row_periods:
    period_data = allticks_data[["ISFIN", "SASA"]].iloc[start_row-1:end_row]  # Adjust for 0-based indexing

    # Extract the data for each column
    x = period_data["ISFIN"]
    y = period_data["SASA"]

    # Create a new figure for each row period
    plt.figure(figsize=(8, 6))

    # Plot the data
    plt.plot(x, label="ISFIN", color="blue")
    plt.plot(y, label="SASA", color="red")

    # Set labels and title
    plt.xlabel("Row")
    plt.ylabel("Value")
    plt.title(f"Rows {start_row}-{end_row}")

    # Add a legend
    plt.legend()

    # Show the individual plot
    plt.show()

I chose ISFIN and SASA since they have the highest coefficients of variance, therefore, these two companies have the highest variations within this data (without having the effect of the magnitude of their stock values). I analyzed the periods where their correlations have negative sign. I aimed to see their unique relation in such periods rather than those periods where they can be affected from the overall market and have the same resulting change.

In the beginning, i.e. rows 1-750; ISFIN stock prices were higher than SASA but it started to decline through the end of this monthly period. SASA on the other hand, starts to increase. It can also be seen from the Google Trends data. It can be seen that SASA is trending more and more towards the end of the period.

Correlation data, IMG1: [https://ibb.co/JxY8y2k] Google Trends data, IMG2: [https://ibb.co/LJHJSYR]

In rows 26250-27000, (Start Timestamp: 2016-07-18T08:15:00Z, End Timestamp: 2016-08-19T14:30:00Z), ISFIN’s stock prices started lower than SASA’s but surpassed it at the middle of the month. It can be seen that after the beginning of the month, ISFIN is increasing and SASA is decreasing. Also in Google Trends data, even though SASA(red) has more number of searches in total, ISFIN had 7 peaks but SASA had 6. This shows that ISFIN has gained popularity in this month. Correlation data, IMG3: [https://ibb.co/3Y3Rn2Q] Google Trends data, IMG4: [https://ibb.co/0FSBNRH]

In [26]:
#I selected these two because they had the highest correlation

# Select the specified columns
selected_columns = ["TRKCM", "EREGL"]

# Initialize a list to store correlation values and corresponding timestamps
correlations_with_timestamps = []

# Calculate the correlation and create and display separate plots for each set of 750 rows
num_sets = len(allticks_data) // 750

for i in range(num_sets):
    start_row = i * 750
    end_row = (i + 1) * 750
    period_data = allticks_data[selected_columns][start_row:end_row]
    
    # Drop rows with missing values
    period_data.dropna(inplace=True)
    
    # Extract the data for each column
    x = period_data["TRKCM"]
    y = period_data["EREGL"]
    
    # Calculate the correlation coefficient and correlation
    correlation_coefficient = np.corrcoef(x, y)[0, 1]
    correlation = np.corrcoef(x, y)[0, 1]  # Calculate the correlation itself
    
    # Get the start and end timestamps
    start_timestamp = allticks_data["timestamp"].iloc[start_row]
    end_timestamp = allticks_data["timestamp"].iloc[end_row - 1]
    
    # Create a scatter plot for the period
    plt.figure(figsize=(8, 6))
    plt.scatter(x, y, label="Data Points", color="blue")
    
    # Set labels and title
    plt.xlabel("TRKCM")
    plt.ylabel("EREGL")
    plt.title(f"Scatter Plot for Rows {start_row+1}-{end_row}")
    
    # Add text to display the correlation coefficient and correlation
    plt.text(0.2, 0.85, f"Correlation Coefficient: {correlation_coefficient:.2f}", transform=plt.gca().transAxes, fontsize=12)
    plt.text(0.2, 0.78, f"Correlation: {correlation:.2f}", transform=plt.gca().transAxes, fontsize=12)
    
    # Display the plot
    plt.legend(loc="upper right")
    plt.show()
    
    # Determine the correlation sign based on the correlation coefficient
    correlation_sign = "positive" if correlation_coefficient > 0 else "negative"
    
    # Add the correlation and timestamps to the list
    correlations_with_timestamps.append({
        "Correlation": correlation,
        "Correlation Sign": correlation_sign,
        "Start Timestamp": start_timestamp,
        "End Timestamp": end_timestamp,
        "Start Row": start_row,
        "End Row": end_row
    })

# Display the list of correlation values and timestamps
for item in correlations_with_timestamps:
    print(f"Correlation: {item['Correlation']:.2f}, Sign: {item['Correlation Sign']}, Start Timestamp: {item['Start Timestamp']}, End Timestamp: {item['End Timestamp']}, Start Row: {item['Start Row']}, End Row: {item['End Row']}")
Correlation: 0.59, Sign: positive, Start Timestamp: 2012-09-17T06:45:00Z, End Timestamp: 2012-10-31T10:00:00Z, Start Row: 0, End Row: 750
Correlation: 0.28, Sign: positive, Start Timestamp: 2012-10-31T10:15:00Z, End Timestamp: 2012-12-11T09:00:00Z, Start Row: 750, End Row: 1500
Correlation: 0.92, Sign: positive, Start Timestamp: 2012-12-11T09:15:00Z, End Timestamp: 2013-01-22T08:45:00Z, Start Row: 1500, End Row: 2250
Correlation: 0.83, Sign: positive, Start Timestamp: 2013-01-22T09:00:00Z, End Timestamp: 2013-03-04T07:45:00Z, Start Row: 2250, End Row: 3000
Correlation: 0.53, Sign: positive, Start Timestamp: 2013-03-04T08:00:00Z, End Timestamp: 2013-04-11T13:15:00Z, Start Row: 3000, End Row: 3750
Correlation: 0.99, Sign: positive, Start Timestamp: 2013-04-11T13:30:00Z, End Timestamp: 2013-05-23T11:45:00Z, Start Row: 3750, End Row: 4500
Correlation: 0.75, Sign: positive, Start Timestamp: 2013-05-23T12:00:00Z, End Timestamp: 2013-07-03T08:45:00Z, Start Row: 4500, End Row: 5250
Correlation: 0.01, Sign: positive, Start Timestamp: 2013-07-03T09:00:00Z, End Timestamp: 2013-08-16T08:00:00Z, Start Row: 5250, End Row: 6000
Correlation: 0.56, Sign: positive, Start Timestamp: 2013-08-16T08:15:00Z, End Timestamp: 2013-09-26T12:15:00Z, Start Row: 6000, End Row: 6750
Correlation: 0.46, Sign: positive, Start Timestamp: 2013-09-26T12:30:00Z, End Timestamp: 2013-11-13T13:00:00Z, Start Row: 6750, End Row: 7500
Correlation: 0.84, Sign: positive, Start Timestamp: 2013-11-13T13:15:00Z, End Timestamp: 2013-12-23T10:15:00Z, Start Row: 7500, End Row: 8250
Correlation: -0.11, Sign: negative, Start Timestamp: 2013-12-23T12:00:00Z, End Timestamp: 2014-01-31T10:00:00Z, Start Row: 8250, End Row: 9000
Correlation: 0.70, Sign: positive, Start Timestamp: 2014-01-31T10:15:00Z, End Timestamp: 2014-03-12T08:45:00Z, Start Row: 9000, End Row: 9750
Correlation: 0.86, Sign: positive, Start Timestamp: 2014-03-12T09:00:00Z, End Timestamp: 2014-04-21T12:15:00Z, Start Row: 9750, End Row: 10500
Correlation: 0.64, Sign: positive, Start Timestamp: 2014-04-21T12:30:00Z, End Timestamp: 2014-06-04T12:45:00Z, Start Row: 10500, End Row: 11250
Correlation: 0.79, Sign: positive, Start Timestamp: 2014-06-04T13:00:00Z, End Timestamp: 2014-07-14T08:00:00Z, Start Row: 11250, End Row: 12000
Correlation: 0.56, Sign: positive, Start Timestamp: 2014-07-14T08:15:00Z, End Timestamp: 2014-08-25T12:15:00Z, Start Row: 12000, End Row: 12750
Correlation: 0.33, Sign: positive, Start Timestamp: 2014-08-25T12:30:00Z, End Timestamp: 2014-10-01T13:00:00Z, Start Row: 12750, End Row: 13500
Correlation: 0.72, Sign: positive, Start Timestamp: 2014-10-01T13:15:00Z, End Timestamp: 2014-11-14T08:15:00Z, Start Row: 13500, End Row: 14250
Correlation: 0.15, Sign: positive, Start Timestamp: 2014-11-14T08:30:00Z, End Timestamp: 2014-12-23T08:30:00Z, Start Row: 14250, End Row: 15000
Correlation: 0.22, Sign: positive, Start Timestamp: 2014-12-23T08:45:00Z, End Timestamp: 2015-01-30T10:00:00Z, Start Row: 15000, End Row: 15750
Correlation: 0.74, Sign: positive, Start Timestamp: 2015-01-30T10:15:00Z, End Timestamp: 2015-03-10T13:30:00Z, Start Row: 15750, End Row: 16500
Correlation: 0.87, Sign: positive, Start Timestamp: 2015-03-10T13:45:00Z, End Timestamp: 2015-04-16T12:15:00Z, Start Row: 16500, End Row: 17250
Correlation: 0.05, Sign: positive, Start Timestamp: 2015-04-16T12:30:00Z, End Timestamp: 2015-05-28T13:15:00Z, Start Row: 17250, End Row: 18000
Correlation: 0.28, Sign: positive, Start Timestamp: 2015-05-28T13:30:00Z, End Timestamp: 2015-07-07T06:30:00Z, Start Row: 18000, End Row: 18750
Correlation: 0.79, Sign: positive, Start Timestamp: 2015-07-07T06:45:00Z, End Timestamp: 2015-08-14T11:30:00Z, Start Row: 18750, End Row: 19500
Correlation: 0.71, Sign: positive, Start Timestamp: 2015-08-14T11:45:00Z, End Timestamp: 2015-09-22T13:00:00Z, Start Row: 19500, End Row: 20250
Correlation: 0.82, Sign: positive, Start Timestamp: 2015-09-22T13:15:00Z, End Timestamp: 2015-11-09T09:30:00Z, Start Row: 20250, End Row: 21000
Correlation: 0.93, Sign: positive, Start Timestamp: 2015-11-09T09:45:00Z, End Timestamp: 2015-12-15T09:30:00Z, Start Row: 21000, End Row: 21750
Correlation: 0.18, Sign: positive, Start Timestamp: 2015-12-15T09:45:00Z, End Timestamp: 2016-01-20T08:15:00Z, Start Row: 21750, End Row: 22500
Correlation: 0.84, Sign: positive, Start Timestamp: 2016-01-20T08:30:00Z, End Timestamp: 2016-02-23T14:45:00Z, Start Row: 22500, End Row: 23250
Correlation: 0.84, Sign: positive, Start Timestamp: 2016-02-23T15:00:00Z, End Timestamp: 2016-03-29T10:45:00Z, Start Row: 23250, End Row: 24000
Correlation: 0.72, Sign: positive, Start Timestamp: 2016-03-29T11:00:00Z, End Timestamp: 2016-05-03T07:00:00Z, Start Row: 24000, End Row: 24750
Correlation: -0.37, Sign: negative, Start Timestamp: 2016-05-03T07:15:00Z, End Timestamp: 2016-06-07T13:30:00Z, Start Row: 24750, End Row: 25500
Correlation: 0.90, Sign: positive, Start Timestamp: 2016-06-07T13:45:00Z, End Timestamp: 2016-07-18T08:00:00Z, Start Row: 25500, End Row: 26250
Correlation: 0.77, Sign: positive, Start Timestamp: 2016-07-18T08:15:00Z, End Timestamp: 2016-08-19T14:30:00Z, Start Row: 26250, End Row: 27000
Correlation: 0.56, Sign: positive, Start Timestamp: 2016-08-22T06:30:00Z, End Timestamp: 2016-09-30T13:45:00Z, Start Row: 27000, End Row: 27750
Correlation: 0.25, Sign: positive, Start Timestamp: 2016-09-30T14:00:00Z, End Timestamp: 2016-11-07T07:45:00Z, Start Row: 27750, End Row: 28500
Correlation: 0.92, Sign: positive, Start Timestamp: 2016-11-07T08:00:00Z, End Timestamp: 2016-12-09T08:45:00Z, Start Row: 28500, End Row: 29250
Correlation: 0.30, Sign: positive, Start Timestamp: 2016-12-09T09:00:00Z, End Timestamp: 2017-01-12T08:15:00Z, Start Row: 29250, End Row: 30000
Correlation: 0.83, Sign: positive, Start Timestamp: 2017-01-12T08:30:00Z, End Timestamp: 2017-02-15T07:30:00Z, Start Row: 30000, End Row: 30750
Correlation: 0.08, Sign: positive, Start Timestamp: 2017-02-15T07:45:00Z, End Timestamp: 2017-03-21T07:30:00Z, Start Row: 30750, End Row: 31500
Correlation: 0.53, Sign: positive, Start Timestamp: 2017-03-21T07:45:00Z, End Timestamp: 2017-04-21T14:15:00Z, Start Row: 31500, End Row: 32250
Correlation: 0.52, Sign: positive, Start Timestamp: 2017-04-21T14:30:00Z, End Timestamp: 2017-05-29T12:30:00Z, Start Row: 32250, End Row: 33000
Correlation: 0.40, Sign: positive, Start Timestamp: 2017-05-29T12:45:00Z, End Timestamp: 2017-07-04T11:15:00Z, Start Row: 33000, End Row: 33750
Correlation: 0.89, Sign: positive, Start Timestamp: 2017-07-04T11:30:00Z, End Timestamp: 2017-08-07T08:30:00Z, Start Row: 33750, End Row: 34500
Correlation: 0.34, Sign: positive, Start Timestamp: 2017-08-07T08:45:00Z, End Timestamp: 2017-09-13T12:45:00Z, Start Row: 34500, End Row: 35250
Correlation: 0.88, Sign: positive, Start Timestamp: 2017-09-13T13:00:00Z, End Timestamp: 2017-10-17T12:00:00Z, Start Row: 35250, End Row: 36000
Correlation: 0.94, Sign: positive, Start Timestamp: 2017-10-17T12:15:00Z, End Timestamp: 2017-11-20T09:45:00Z, Start Row: 36000, End Row: 36750
Correlation: 0.65, Sign: positive, Start Timestamp: 2017-11-20T10:45:00Z, End Timestamp: 2017-12-22T08:00:00Z, Start Row: 36750, End Row: 37500
Correlation: 0.41, Sign: positive, Start Timestamp: 2017-12-22T08:15:00Z, End Timestamp: 2018-01-25T13:30:00Z, Start Row: 37500, End Row: 38250
Correlation: -0.15, Sign: negative, Start Timestamp: 2018-01-25T13:45:00Z, End Timestamp: 2018-02-28T12:30:00Z, Start Row: 38250, End Row: 39000
Correlation: -0.35, Sign: negative, Start Timestamp: 2018-02-28T12:45:00Z, End Timestamp: 2018-04-03T12:00:00Z, Start Row: 39000, End Row: 39750
Correlation: 0.29, Sign: positive, Start Timestamp: 2018-04-03T12:15:00Z, End Timestamp: 2018-05-09T09:15:00Z, Start Row: 39750, End Row: 40500
Correlation: -0.66, Sign: negative, Start Timestamp: 2018-05-09T09:30:00Z, End Timestamp: 2018-06-12T07:45:00Z, Start Row: 40500, End Row: 41250
Correlation: -0.04, Sign: negative, Start Timestamp: 2018-06-12T08:00:00Z, End Timestamp: 2018-07-17T10:45:00Z, Start Row: 41250, End Row: 42000
Correlation: -0.23, Sign: negative, Start Timestamp: 2018-07-17T11:00:00Z, End Timestamp: 2018-08-17T14:15:00Z, Start Row: 42000, End Row: 42750
Correlation: -0.30, Sign: negative, Start Timestamp: 2018-08-17T14:30:00Z, End Timestamp: 2018-09-28T09:30:00Z, Start Row: 42750, End Row: 43500
Correlation: 0.93, Sign: positive, Start Timestamp: 2018-09-28T09:45:00Z, End Timestamp: 2018-11-01T15:00:00Z, Start Row: 43500, End Row: 44250
Correlation: 0.76, Sign: positive, Start Timestamp: 2018-11-02T06:45:00Z, End Timestamp: 2018-12-05T11:30:00Z, Start Row: 44250, End Row: 45000
Correlation: 0.57, Sign: positive, Start Timestamp: 2018-12-05T11:45:00Z, End Timestamp: 2019-01-09T07:00:00Z, Start Row: 45000, End Row: 45750
Correlation: 0.85, Sign: positive, Start Timestamp: 2019-01-09T07:15:00Z, End Timestamp: 2019-02-11T11:15:00Z, Start Row: 45750, End Row: 46500
Correlation: 0.83, Sign: positive, Start Timestamp: 2019-02-11T11:30:00Z, End Timestamp: 2019-03-14T14:45:00Z, Start Row: 46500, End Row: 47250
Correlation: 0.17, Sign: positive, Start Timestamp: 2019-03-14T15:00:00Z, End Timestamp: 2019-04-17T10:00:00Z, Start Row: 47250, End Row: 48000
Correlation: 0.73, Sign: positive, Start Timestamp: 2019-04-17T10:45:00Z, End Timestamp: 2019-05-22T14:15:00Z, Start Row: 48000, End Row: 48750
Correlation: 0.90, Sign: positive, Start Timestamp: 2019-05-22T14:30:00Z, End Timestamp: 2019-06-28T15:00:00Z, Start Row: 48750, End Row: 49500
In [59]:
#I wanted to see those periods with negative correlation to see which one is increasing and decreasing.

# Define the row periods for negative correlations
row_periods = [(item["Start Row"], item["End Row"]) for item in correlations_with_timestamps if item["Correlation Sign"] == "negative"]
]

# Iterate through the row periods and create separate plots
for start_row, end_row in row_periods:
    period_data = allticks_data[["TRKCM", "EREGL"]].iloc[start_row-1:end_row]  # Adjust for 0-based indexing

    # Extract the data for each column
    x = period_data["TRKCM"]
    y = period_data["EREGL"]

    # Create a new figure for each row period
    plt.figure(figsize=(8, 6))

    # Plot the data
    plt.plot(x, label="TRKCM", color="blue")
    plt.plot(y, label="EREGL", color="red")

    # Set labels and title
    plt.xlabel("Row")
    plt.ylabel("Value")
    plt.title(f"Rows {start_row}-{end_row}")

    # Add a legend
    plt.legend()

    # Show the individual plot
    plt.show()

I also wanted to examine TRKCM and EREGL since these two have the highest correlation. When I look at the periods when they have negative correlations, I saw that EREGLs stock prices are always higher than TRKCM and its fluctuations are more apparent from the plot. For example, in rows 8250-8500 they behaved in the same way but after the 8500th row (2014-01-06T14:15:00Z) EREGL starts to increase but TRKCM is declining. Correlation data, IMG5:[https://ibb.co/JsTQGMR]

In [63]:
#I selected these two because they had the second highest correlation. TRKCM and EREGL have the highest correlation but TRKCM doesnt't have Google Trends data for comparison.



# Select the specified columns
selected_columns = ["FROTO", "EREGL"]  

# Initialize a list to store correlation values and corresponding timestamps for negative correlations
correlations_with_timestamps = []

# Calculate the correlation and create and display separate plots for each set of 750 rows
num_sets = len(allticks_data) // 750

for i in range(num_sets):
    start_row = i * 750
    end_row = (i + 1) * 750
    period_data = allticks_data[selected_columns][start_row:end_row]
    
    # Drop rows with missing values
    period_data.dropna(inplace=True)
    
    # Extract the data for each column
    x = period_data["FROTO"]  # Replacing "TRKCM" with "FROTO"
    y = period_data["EREGL"]
    
    # Calculate the correlation coefficient and correlation
    correlation_coefficient = np.corrcoef(x, y)[0, 1]
    correlation = np.corrcoef(x, y)[0, 1]  # Calculate the correlation itself
    
    # Get the start and end timestamps
    start_timestamp = allticks_data["timestamp"].iloc[start_row]
    end_timestamp = allticks_data["timestamp"].iloc[end_row - 1]
    
    # Determine the correlation sign based on the correlation coefficient
    correlation_sign = "positive" if correlation_coefficient > 0 else "negative"
    
    # Create a scatter plot for the period only if the correlation is negative
    if correlation_sign == "negative":
        # Create a scatter plot for the period
        plt.figure(figsize=(8, 6))
        plt.scatter(x, y, label="Data Points", color="blue")
        
        # Set labels and title
        plt.xlabel("FROTO")  # Adjusted for "FROTO"
        plt.ylabel("EREGL")
        plt.title(f"Scatter Plot for Rows {start_row+1}-{end_row}")
        
        # Add text to display the correlation coefficient and correlation
        plt.text(0.2, 0.85, f"Correlation Coefficient: {correlation_coefficient:.2f}", transform=plt.gca().transAxes, fontsize=12)
        plt.text(0.2, 0.78, f"Correlation: {correlation:.2f}", transform=plt.gca().transAxes, fontsize=12)
        
        # Display the plot
        plt.legend(loc="upper right")
        plt.show()
    
    # Add the correlation and timestamps to the list
    correlations_with_timestamps.append({
        "Correlation": correlation,
        "Correlation Sign": correlation_sign,
        "Start Timestamp": start_timestamp,
        "End Timestamp": end_timestamp,
        "Start Row": start_row,
        "End Row": end_row
    })
for item in correlations_with_timestamps:
    print(f"Correlation: {item['Correlation']:.2f}, Sign: {item['Correlation Sign']}, Start Timestamp: {item['Start Timestamp']}, End Timestamp: {item['End Timestamp']}, Start Row: {item['Start Row']}, End Row: {item['End Row']}")
Correlation: -0.09, Sign: negative, Start Timestamp: 2012-09-17T06:45:00Z, End Timestamp: 2012-10-31T10:00:00Z, Start Row: 0, End Row: 750
Correlation: 0.76, Sign: positive, Start Timestamp: 2012-10-31T10:15:00Z, End Timestamp: 2012-12-11T09:00:00Z, Start Row: 750, End Row: 1500
Correlation: 0.56, Sign: positive, Start Timestamp: 2012-12-11T09:15:00Z, End Timestamp: 2013-01-22T08:45:00Z, Start Row: 1500, End Row: 2250
Correlation: 0.80, Sign: positive, Start Timestamp: 2013-01-22T09:00:00Z, End Timestamp: 2013-03-04T07:45:00Z, Start Row: 2250, End Row: 3000
Correlation: 0.60, Sign: positive, Start Timestamp: 2013-03-04T08:00:00Z, End Timestamp: 2013-04-11T13:15:00Z, Start Row: 3000, End Row: 3750
Correlation: 0.94, Sign: positive, Start Timestamp: 2013-04-11T13:30:00Z, End Timestamp: 2013-05-23T11:45:00Z, Start Row: 3750, End Row: 4500
Correlation: 0.50, Sign: positive, Start Timestamp: 2013-05-23T12:00:00Z, End Timestamp: 2013-07-03T08:45:00Z, Start Row: 4500, End Row: 5250
Correlation: 0.65, Sign: positive, Start Timestamp: 2013-07-03T09:00:00Z, End Timestamp: 2013-08-16T08:00:00Z, Start Row: 5250, End Row: 6000
Correlation: 0.66, Sign: positive, Start Timestamp: 2013-08-16T08:15:00Z, End Timestamp: 2013-09-26T12:15:00Z, Start Row: 6000, End Row: 6750
Correlation: -0.25, Sign: negative, Start Timestamp: 2013-09-26T12:30:00Z, End Timestamp: 2013-11-13T13:00:00Z, Start Row: 6750, End Row: 7500
Correlation: 0.68, Sign: positive, Start Timestamp: 2013-11-13T13:15:00Z, End Timestamp: 2013-12-23T10:15:00Z, Start Row: 7500, End Row: 8250
Correlation: -0.64, Sign: negative, Start Timestamp: 2013-12-23T12:00:00Z, End Timestamp: 2014-01-31T10:00:00Z, Start Row: 8250, End Row: 9000
Correlation: 0.61, Sign: positive, Start Timestamp: 2014-01-31T10:15:00Z, End Timestamp: 2014-03-12T08:45:00Z, Start Row: 9000, End Row: 9750
Correlation: 0.83, Sign: positive, Start Timestamp: 2014-03-12T09:00:00Z, End Timestamp: 2014-04-21T12:15:00Z, Start Row: 9750, End Row: 10500
Correlation: 0.82, Sign: positive, Start Timestamp: 2014-04-21T12:30:00Z, End Timestamp: 2014-06-04T12:45:00Z, Start Row: 10500, End Row: 11250
Correlation: -0.24, Sign: negative, Start Timestamp: 2014-06-04T13:00:00Z, End Timestamp: 2014-07-14T08:00:00Z, Start Row: 11250, End Row: 12000
Correlation: 0.81, Sign: positive, Start Timestamp: 2014-07-14T08:15:00Z, End Timestamp: 2014-08-25T12:15:00Z, Start Row: 12000, End Row: 12750
Correlation: 0.06, Sign: positive, Start Timestamp: 2014-08-25T12:30:00Z, End Timestamp: 2014-10-01T13:00:00Z, Start Row: 12750, End Row: 13500
Correlation: 0.56, Sign: positive, Start Timestamp: 2014-10-01T13:15:00Z, End Timestamp: 2014-11-14T08:15:00Z, Start Row: 13500, End Row: 14250
Correlation: 0.23, Sign: positive, Start Timestamp: 2014-11-14T08:30:00Z, End Timestamp: 2014-12-23T08:30:00Z, Start Row: 14250, End Row: 15000
Correlation: -0.26, Sign: negative, Start Timestamp: 2014-12-23T08:45:00Z, End Timestamp: 2015-01-30T10:00:00Z, Start Row: 15000, End Row: 15750
Correlation: 0.24, Sign: positive, Start Timestamp: 2015-01-30T10:15:00Z, End Timestamp: 2015-03-10T13:30:00Z, Start Row: 15750, End Row: 16500
Correlation: 0.56, Sign: positive, Start Timestamp: 2015-03-10T13:45:00Z, End Timestamp: 2015-04-16T12:15:00Z, Start Row: 16500, End Row: 17250
Correlation: 0.85, Sign: positive, Start Timestamp: 2015-04-16T12:30:00Z, End Timestamp: 2015-05-28T13:15:00Z, Start Row: 17250, End Row: 18000
Correlation: 0.08, Sign: positive, Start Timestamp: 2015-05-28T13:30:00Z, End Timestamp: 2015-07-07T06:30:00Z, Start Row: 18000, End Row: 18750
Correlation: 0.50, Sign: positive, Start Timestamp: 2015-07-07T06:45:00Z, End Timestamp: 2015-08-14T11:30:00Z, Start Row: 18750, End Row: 19500
Correlation: 0.57, Sign: positive, Start Timestamp: 2015-08-14T11:45:00Z, End Timestamp: 2015-09-22T13:00:00Z, Start Row: 19500, End Row: 20250
Correlation: 0.78, Sign: positive, Start Timestamp: 2015-09-22T13:15:00Z, End Timestamp: 2015-11-09T09:30:00Z, Start Row: 20250, End Row: 21000
Correlation: 0.84, Sign: positive, Start Timestamp: 2015-11-09T09:45:00Z, End Timestamp: 2015-12-15T09:30:00Z, Start Row: 21000, End Row: 21750
Correlation: 0.83, Sign: positive, Start Timestamp: 2015-12-15T09:45:00Z, End Timestamp: 2016-01-20T08:15:00Z, Start Row: 21750, End Row: 22500
Correlation: 0.84, Sign: positive, Start Timestamp: 2016-01-20T08:30:00Z, End Timestamp: 2016-02-23T14:45:00Z, Start Row: 22500, End Row: 23250
Correlation: 0.89, Sign: positive, Start Timestamp: 2016-02-23T15:00:00Z, End Timestamp: 2016-03-29T10:45:00Z, Start Row: 23250, End Row: 24000
Correlation: 0.85, Sign: positive, Start Timestamp: 2016-03-29T11:00:00Z, End Timestamp: 2016-05-03T07:00:00Z, Start Row: 24000, End Row: 24750
Correlation: 0.47, Sign: positive, Start Timestamp: 2016-05-03T07:15:00Z, End Timestamp: 2016-06-07T13:30:00Z, Start Row: 24750, End Row: 25500
Correlation: 0.06, Sign: positive, Start Timestamp: 2016-06-07T13:45:00Z, End Timestamp: 2016-07-18T08:00:00Z, Start Row: 25500, End Row: 26250
Correlation: 0.80, Sign: positive, Start Timestamp: 2016-07-18T08:15:00Z, End Timestamp: 2016-08-19T14:30:00Z, Start Row: 26250, End Row: 27000
Correlation: 0.87, Sign: positive, Start Timestamp: 2016-08-22T06:30:00Z, End Timestamp: 2016-09-30T13:45:00Z, Start Row: 27000, End Row: 27750
Correlation: 0.70, Sign: positive, Start Timestamp: 2016-09-30T14:00:00Z, End Timestamp: 2016-11-07T07:45:00Z, Start Row: 27750, End Row: 28500
Correlation: -0.10, Sign: negative, Start Timestamp: 2016-11-07T08:00:00Z, End Timestamp: 2016-12-09T08:45:00Z, Start Row: 28500, End Row: 29250
Correlation: 0.45, Sign: positive, Start Timestamp: 2016-12-09T09:00:00Z, End Timestamp: 2017-01-12T08:15:00Z, Start Row: 29250, End Row: 30000
Correlation: 0.78, Sign: positive, Start Timestamp: 2017-01-12T08:30:00Z, End Timestamp: 2017-02-15T07:30:00Z, Start Row: 30000, End Row: 30750
Correlation: 0.51, Sign: positive, Start Timestamp: 2017-02-15T07:45:00Z, End Timestamp: 2017-03-21T07:30:00Z, Start Row: 30750, End Row: 31500
Correlation: 0.47, Sign: positive, Start Timestamp: 2017-03-21T07:45:00Z, End Timestamp: 2017-04-21T14:15:00Z, Start Row: 31500, End Row: 32250
Correlation: 0.50, Sign: positive, Start Timestamp: 2017-04-21T14:30:00Z, End Timestamp: 2017-05-29T12:30:00Z, Start Row: 32250, End Row: 33000
Correlation: 0.21, Sign: positive, Start Timestamp: 2017-05-29T12:45:00Z, End Timestamp: 2017-07-04T11:15:00Z, Start Row: 33000, End Row: 33750
Correlation: 0.86, Sign: positive, Start Timestamp: 2017-07-04T11:30:00Z, End Timestamp: 2017-08-07T08:30:00Z, Start Row: 33750, End Row: 34500
Correlation: 0.35, Sign: positive, Start Timestamp: 2017-08-07T08:45:00Z, End Timestamp: 2017-09-13T12:45:00Z, Start Row: 34500, End Row: 35250
Correlation: 0.39, Sign: positive, Start Timestamp: 2017-09-13T13:00:00Z, End Timestamp: 2017-10-17T12:00:00Z, Start Row: 35250, End Row: 36000
Correlation: 0.90, Sign: positive, Start Timestamp: 2017-10-17T12:15:00Z, End Timestamp: 2017-11-20T09:45:00Z, Start Row: 36000, End Row: 36750
Correlation: 0.51, Sign: positive, Start Timestamp: 2017-11-20T10:45:00Z, End Timestamp: 2017-12-22T08:00:00Z, Start Row: 36750, End Row: 37500
Correlation: 0.55, Sign: positive, Start Timestamp: 2017-12-22T08:15:00Z, End Timestamp: 2018-01-25T13:30:00Z, Start Row: 37500, End Row: 38250
Correlation: 0.50, Sign: positive, Start Timestamp: 2018-01-25T13:45:00Z, End Timestamp: 2018-02-28T12:30:00Z, Start Row: 38250, End Row: 39000
Correlation: 0.19, Sign: positive, Start Timestamp: 2018-02-28T12:45:00Z, End Timestamp: 2018-04-03T12:00:00Z, Start Row: 39000, End Row: 39750
Correlation: 0.08, Sign: positive, Start Timestamp: 2018-04-03T12:15:00Z, End Timestamp: 2018-05-09T09:15:00Z, Start Row: 39750, End Row: 40500
Correlation: 0.83, Sign: positive, Start Timestamp: 2018-05-09T09:30:00Z, End Timestamp: 2018-06-12T07:45:00Z, Start Row: 40500, End Row: 41250
Correlation: 0.20, Sign: positive, Start Timestamp: 2018-06-12T08:00:00Z, End Timestamp: 2018-07-17T10:45:00Z, Start Row: 41250, End Row: 42000
Correlation: 0.12, Sign: positive, Start Timestamp: 2018-07-17T11:00:00Z, End Timestamp: 2018-08-17T14:15:00Z, Start Row: 42000, End Row: 42750
Correlation: 0.25, Sign: positive, Start Timestamp: 2018-08-17T14:30:00Z, End Timestamp: 2018-09-28T09:30:00Z, Start Row: 42750, End Row: 43500
Correlation: 0.82, Sign: positive, Start Timestamp: 2018-09-28T09:45:00Z, End Timestamp: 2018-11-01T15:00:00Z, Start Row: 43500, End Row: 44250
Correlation: 0.57, Sign: positive, Start Timestamp: 2018-11-02T06:45:00Z, End Timestamp: 2018-12-05T11:30:00Z, Start Row: 44250, End Row: 45000
Correlation: 0.57, Sign: positive, Start Timestamp: 2018-12-05T11:45:00Z, End Timestamp: 2019-01-09T07:00:00Z, Start Row: 45000, End Row: 45750
Correlation: 0.69, Sign: positive, Start Timestamp: 2019-01-09T07:15:00Z, End Timestamp: 2019-02-11T11:15:00Z, Start Row: 45750, End Row: 46500
Correlation: 0.14, Sign: positive, Start Timestamp: 2019-02-11T11:30:00Z, End Timestamp: 2019-03-14T14:45:00Z, Start Row: 46500, End Row: 47250
Correlation: 0.66, Sign: positive, Start Timestamp: 2019-03-14T15:00:00Z, End Timestamp: 2019-04-17T10:00:00Z, Start Row: 47250, End Row: 48000
Correlation: 0.44, Sign: positive, Start Timestamp: 2019-04-17T10:45:00Z, End Timestamp: 2019-05-22T14:15:00Z, Start Row: 48000, End Row: 48750
Correlation: 0.80, Sign: positive, Start Timestamp: 2019-05-22T14:30:00Z, End Timestamp: 2019-06-28T15:00:00Z, Start Row: 48750, End Row: 49500
In [62]:
# Define the row periods for negative correlations
row_periods = [(item["Start Row"], item["End Row"]) for item in correlations_with_timestamps if item["Correlation Sign"] == "negative"]

# Iterate through the row periods with negative correlations and create separate plots
for start_row, end_row in row_periods:
    period_data = allticks_data[["FROTO", "EREGL"]].iloc[start_row-1:end_row]  # Adjust for 0-based indexing and "FROTO" column

    # Extract the data for each column
    x = period_data["FROTO"]
    y = period_data["EREGL"]

    # Create a new figure for each row period
    plt.figure(figsize=(8, 6))

    # Plot the data
    plt.plot(x, label="FROTO", color="blue")
    plt.plot(y, label="EREGL", color="red")

    # Set labels and title
    plt.xlabel("Row")
    plt.ylabel("Value")
    plt.title(f"Rows {start_row}-{end_row}")

    # Add a legend
    plt.legend()

    # Show the individual plot
    plt.show()

I couldn’t find the stock related data of TRKCM in Google Trends. Thus I decided to try for FROTO and EREGL, since they have the second highest correlation. FROTO has much higher stock prices than EREGL, thus EREGL’s fluctuations aren’t seen clearly from the plots, yet it seems that when they had negative correlations; FROTO is decreasing and EREGL is increasing.

For example, in this period, FROTO seems to be decreasing and EREGL is increasing but FROTO’s stock prices are still much higher. Correlation data, IMG6: [https://ibb.co/tLmbQHy] Google Trends data, IMG7: [https://ibb.co/S56n5Fs]

In Google Trends, The average of the FROTO is higher meaning that it is searched more, butEREGL had 11 peaks while FROTO had 10 peaks. It indicates that EREGL has gained popularity in that period but FROTO is still dominating

In [53]:
####### PCA ANALYSIS


from sklearn.decomposition import PCA




# Drop the timestamp column
stock_data = allticks_data.drop(columns=['timestamp'])

# Create an imputer to fill missing values with the column mean
imputer = SimpleImputer(strategy='mean')
stock_data_imputed = imputer.fit_transform(stock_data)

# Standardize the data
scaler = StandardScaler()
stock_data_standardized = scaler.fit_transform(stock_data_imputed)

# Initialize and fit the PCA model
n_components = 60  # Number of principal components to retain  ##i first selected the number of components as the original dimension
pca = PCA(n_components=n_components)
pca.fit(stock_data_standardized)

# Get the explained variance ratio
explained_variance_ratio = pca.explained_variance_ratio_

# Create a table for component statistics
component_stats = pd.DataFrame({
    'Component': range(1, len(explained_variance_ratio) + 1),
    'Standard Deviation': np.sqrt(pca.explained_variance_),
    'Proportion of Variance': pca.explained_variance_ratio_,
    'Cumulative Proportion': np.cumsum(pca.explained_variance_ratio_)
})

# Display the component statistics table
print("Component Statistics:")
print(component_stats)

# Create a table for loadings
loadings = pd.DataFrame(pca.components_, columns=stock_data.columns)

# Display the loadings table
print("\nLoadings Table:")
print(loadings)

component_stats.to_excel('component_stats.xlsx',index=False)
loadings.to_excel('loadings.xlsx',index=False)
Component Statistics:
    Component  Standard Deviation  Proportion of Variance  \
0           1            5.411674                0.488094   
1           2            3.196438                0.170283   
2           3            2.536973                0.107268   
3           4            1.730182                0.049891   
4           5            1.511590                0.038081   
5           6            1.133295                0.021406   
6           7            0.958872                0.015324   
7           8            0.892638                0.013280   
8           9            0.783668                0.010235   
9          10            0.729850                0.008878   
10         11            0.707544                0.008343   
11         12            0.653015                0.007107   
12         13            0.611131                0.006225   
13         14            0.597257                0.005945   
14         15            0.536428                0.004796   
15         16            0.522433                0.004549   
16         17            0.456779                0.003477   
17         18            0.430460                0.003088   
18         19            0.397430                0.002632   
19         20            0.395831                0.002611   
20         21            0.365204                0.002223   
21         22            0.351648                0.002061   
22         23            0.330286                0.001818   
23         24            0.320561                0.001713   
24         25            0.312629                0.001629   
25         26            0.285812                0.001361   
26         27            0.278007                0.001288   
27         28            0.266828                0.001187   
28         29            0.259451                0.001122   
29         30            0.253319                0.001069   
30         31            0.240825                0.000967   
31         32            0.236844                0.000935   
32         33            0.221634                0.000819   
33         34            0.218693                0.000797   
34         35            0.210692                0.000740   
35         36            0.203632                0.000691   
36         37            0.198371                0.000656   
37         38            0.192681                0.000619   
38         39            0.190980                0.000608   
39         40            0.182159                0.000553   
40         41            0.168839                0.000475   
41         42            0.166813                0.000464   
42         43            0.160176                0.000428   
43         44            0.159183                0.000422   
44         45            0.154896                0.000400   
45         46            0.148436                0.000367   
46         47            0.145597                0.000353   
47         48            0.141189                0.000332   
48         49            0.137005                0.000313   
49         50            0.130059                0.000282   
50         51            0.123291                0.000253   
51         52            0.116004                0.000224   
52         53            0.114420                0.000218   
53         54            0.112003                0.000209   
54         55            0.104784                0.000183   
55         56            0.099252                0.000164   
56         57            0.095562                0.000152   
57         58            0.093453                0.000146   
58         59            0.092427                0.000142   
59         60            0.078555                0.000103   

    Cumulative Proportion  
0                0.488094  
1                0.658377  
2                0.765646  
3                0.815537  
4                0.853618  
5                0.875023  
6                0.890347  
7                0.903627  
8                0.913862  
9                0.922740  
10               0.931083  
11               0.938190  
12               0.944415  
13               0.950360  
14               0.955156  
15               0.959705  
16               0.963182  
17               0.966270  
18               0.968903  
19               0.971514  
20               0.973737  
21               0.975798  
22               0.977616  
23               0.979329  
24               0.980958  
25               0.982319  
26               0.983607  
27               0.984794  
28               0.985916  
29               0.986985  
30               0.987952  
31               0.988887  
32               0.989705  
33               0.990502  
34               0.991242  
35               0.991933  
36               0.992589  
37               0.993208  
38               0.993816  
39               0.994369  
40               0.994844  
41               0.995308  
42               0.995735  
43               0.996158  
44               0.996557  
45               0.996925  
46               0.997278  
47               0.997610  
48               0.997923  
49               0.998205  
50               0.998458  
51               0.998683  
52               0.998901  
53               0.999110  
54               0.999293  
55               0.999457  
56               0.999609  
57               0.999755  
58               0.999897  
59               1.000000  

Loadings Table:
       AEFES     AKBNK      AKSA     AKSEN     ALARK     ALBRK     ANACM  \
0  -0.010619 -0.115330 -0.162991 -0.074086 -0.139545 -0.008576 -0.168831   
1  -0.164429 -0.212004 -0.033005 -0.159788 -0.081720 -0.112583  0.058363   
2  -0.261382  0.073484  0.068234 -0.147875 -0.106180 -0.203374 -0.094810   
3   0.014416 -0.094138  0.076323 -0.196707 -0.158188  0.248636 -0.086114   
4   0.015394 -0.105984  0.066446  0.236019 -0.223156 -0.122610 -0.011328   
5  -0.018140  0.085660 -0.180828  0.241494 -0.126733  0.140711  0.105549   
6   0.075174  0.108106 -0.199153  0.119555 -0.150210  0.406538  0.056822   
7   0.059581 -0.046841 -0.060453 -0.042819 -0.004839  0.053875 -0.000587   
8  -0.114302 -0.051819 -0.063405 -0.069009  0.026365  0.118118 -0.062160   
9  -0.192159  0.000015 -0.083635 -0.118536  0.117608  0.089603 -0.042360   
10  0.455544  0.108440 -0.128275 -0.144293 -0.036502 -0.476641 -0.049088   
11  0.273656  0.002982  0.064819 -0.092101 -0.016472  0.160483 -0.059559   
12  0.102158 -0.147034  0.080613  0.008406 -0.040919 -0.093194  0.010223   
13  0.363439 -0.060055  0.171371  0.018025 -0.014562  0.263588  0.050178   
14  0.017763  0.077017  0.062910 -0.083708 -0.146870 -0.047852 -0.063514   
15  0.290299 -0.137708  0.037418  0.205699  0.118733 -0.117275  0.072106   
16  0.157388 -0.010414  0.045018 -0.348017 -0.066540 -0.061797 -0.068713   
17  0.184640 -0.024658 -0.146063 -0.056727 -0.054352 -0.154460  0.028410   
18 -0.192447  0.055082  0.059132 -0.107376  0.020367 -0.011428  0.022881   
19 -0.042429  0.117334  0.095559 -0.024542 -0.293436 -0.099039  0.007348   
20  0.106945 -0.028529  0.155411  0.254699 -0.015594 -0.222860  0.068526   
21  0.108365  0.017511  0.057691 -0.124020  0.091019  0.143323  0.078801   
22 -0.030401 -0.007885 -0.158598  0.174079 -0.054509 -0.276084  0.047905   
23 -0.162112  0.048563  0.152288  0.151761  0.124432 -0.166273 -0.010833   
24  0.114655 -0.065577  0.026070 -0.206914  0.120101  0.031381  0.100395   
25  0.033088 -0.100286 -0.058682  0.178338  0.127454  0.120163  0.043167   
26  0.014247 -0.036817 -0.258493  0.164146 -0.008177  0.094029 -0.048428   
27  0.264538  0.143681 -0.090983 -0.134268  0.017295  0.050808  0.165177   
28 -0.071104  0.001624 -0.061773 -0.209436  0.047917 -0.157911  0.014677   
29  0.044157  0.032685 -0.102867 -0.205007 -0.124667  0.110479 -0.157199   
30  0.005159  0.024566  0.036919  0.051664  0.112565  0.017934 -0.097106   
31 -0.122877 -0.042648 -0.252617  0.018889  0.007359 -0.099293 -0.040859   
32  0.024959 -0.006291 -0.000861  0.097524  0.187282  0.011025 -0.048922   
33  0.009566 -0.119572  0.011320  0.077451  0.131781 -0.013956  0.008587   
34 -0.138435  0.204523 -0.142875 -0.182783  0.131051 -0.014408 -0.020010   
35  0.010043  0.147489 -0.115384  0.153281  0.205369 -0.005152  0.054440   
36 -0.013053 -0.044556  0.180746 -0.058914  0.063750  0.101186  0.138202   
37  0.003064 -0.059021 -0.333295 -0.114103 -0.067133 -0.029699  0.051244   
38  0.005301 -0.012623  0.270708 -0.188178  0.167777  0.002485 -0.011387   
39 -0.124796 -0.110234 -0.016340  0.019319 -0.011862 -0.053443  0.077511   
40  0.080982  0.093244 -0.112591 -0.079579  0.075012 -0.012072  0.012028   
41 -0.010838 -0.000406 -0.038418  0.102599 -0.295082  0.033703 -0.034795   
42 -0.151969  0.038086  0.143270 -0.114462 -0.131161  0.020794  0.297013   
43 -0.028292 -0.190484 -0.298549 -0.097249  0.101027 -0.007354  0.106816   
44  0.016459 -0.176366 -0.073014 -0.074395  0.471003 -0.023546 -0.139236   
45  0.027183 -0.132076 -0.234859 -0.060666  0.053379  0.028489 -0.140251   
46 -0.026839 -0.066351  0.125087 -0.066543  0.001399 -0.014754  0.333992   
47 -0.060087 -0.015580 -0.043032  0.112853  0.114545  0.062067 -0.151341   
48 -0.026478  0.008901  0.109863  0.049175  0.118161 -0.002270 -0.283479   
49  0.005352 -0.189703 -0.151468 -0.064979 -0.144190 -0.026037  0.259114   
50  0.034925 -0.008133 -0.000400  0.012034 -0.107193 -0.004784 -0.285492   
51  0.028093 -0.241729  0.082577 -0.002079 -0.145300  0.016448 -0.416547   
52  0.027884 -0.130415 -0.033874 -0.017833 -0.035384  0.034810 -0.081980   
53  0.007259  0.250916 -0.041261 -0.020594  0.145511 -0.003784  0.098696   
54 -0.050117 -0.033406 -0.018522 -0.042330  0.086692 -0.034753  0.122512   
55  0.040288  0.034894  0.075279  0.014791  0.016482 -0.029701 -0.123787   
56  0.027306  0.302790 -0.044487  0.020807  0.077065  0.001640 -0.159534   
57 -0.016781 -0.509135  0.104882  0.008200  0.048031 -0.003710  0.020242   
58  0.001090 -0.215202 -0.126375 -0.013029  0.009917 -0.003117  0.159936   
59  0.000757  0.012502  0.072984 -0.034331 -0.021212 -0.003769  0.088067   

       ARCLK     ASELS     ASUZU  ...     TTKOM     TUKAS     TUPRS      USAK  \
0  -0.124655 -0.176649 -0.152185  ...  0.011291 -0.115707 -0.169414 -0.130355   
1  -0.032705  0.003778 -0.088927  ... -0.268609  0.065278  0.096402 -0.142497   
2   0.251670  0.005959 -0.127121  ...  0.110387  0.057695 -0.011123  0.079051   
3  -0.038360 -0.090251  0.095978  ...  0.114039  0.104382 -0.033534 -0.177370   
4  -0.123666  0.063168  0.098447  ... -0.066522 -0.320755 -0.067071  0.091815   
5  -0.161080 -0.056906 -0.020432  ...  0.010542  0.263910  0.089874 -0.001966   
6   0.067813 -0.024458 -0.065776  ... -0.114462 -0.278942  0.020599 -0.153688   
7  -0.104345  0.113283  0.051435  ... -0.082362  0.031030  0.046583  0.016580   
8   0.006664  0.119906 -0.160681  ... -0.133465  0.031005  0.147925  0.228954   
9   0.023125 -0.034343 -0.008719  ... -0.074891  0.012079 -0.061714 -0.067155   
10 -0.055180 -0.030730  0.008656  ... -0.088462 -0.140438 -0.034052 -0.196688   
11  0.066094  0.069978  0.084270  ...  0.068673 -0.027331 -0.008766 -0.088879   
12  0.048971 -0.019740 -0.021615  ...  0.030224 -0.064646  0.000524  0.143203   
13  0.103854 -0.057010 -0.085996  ...  0.020913  0.173693  0.031591 -0.059036   
14  0.031915 -0.109372 -0.097648  ...  0.319629  0.357978  0.050767  0.126956   
15 -0.129523  0.034016  0.137257  ... -0.095622  0.240360  0.047355  0.023699   
16 -0.028427 -0.021454  0.097923  ... -0.259315  0.100060 -0.011701  0.157115   
17  0.050730  0.033792 -0.211358  ...  0.259063  0.034795  0.018639 -0.133439   
18 -0.126534 -0.019365 -0.040688  ...  0.180777 -0.081383 -0.011710 -0.076632   
19  0.039728 -0.060765 -0.135635  ... -0.004601 -0.194376 -0.069111  0.180816   
20  0.005931 -0.096832  0.109330  ... -0.022625  0.083732 -0.090806  0.058641   
21 -0.037471 -0.009767  0.136786  ...  0.065938 -0.203248 -0.055833  0.083576   
22 -0.010573  0.038690 -0.066436  ... -0.016891  0.076976  0.022905 -0.251359   
23  0.067526  0.064384 -0.028129  ... -0.125857  0.238077 -0.066952 -0.150650   
24 -0.026919  0.113481 -0.147130  ...  0.111072 -0.044124 -0.099316  0.072469   
25 -0.090686 -0.037084 -0.067807  ...  0.217019 -0.042028 -0.016617 -0.029907   
26  0.061717  0.097531  0.126139  ...  0.139385  0.276400 -0.097761  0.100972   
27  0.106050 -0.022816  0.001448  ...  0.163660 -0.033907 -0.080165  0.275601   
28 -0.185517  0.055055  0.037534  ...  0.173920 -0.011963  0.164670  0.101822   
29 -0.048602  0.008294 -0.010636  ... -0.069431  0.142148  0.101402 -0.091956   
30 -0.081624  0.163102  0.307790  ... -0.191309 -0.075409 -0.013078 -0.068913   
31  0.016432 -0.024424  0.360749  ...  0.389031 -0.219247 -0.009343  0.063870   
32  0.196253 -0.139004  0.285699  ... -0.050296  0.068673 -0.014651  0.129046   
33 -0.182715 -0.048886 -0.025525  ...  0.086549 -0.168292  0.078617  0.025893   
34  0.095516  0.062422  0.192874  ... -0.082725  0.000227 -0.146867 -0.183764   
35  0.042084  0.034378 -0.235333  ...  0.118004 -0.190314  0.030513 -0.133813   
36 -0.167164  0.067171  0.081514  ... -0.031567 -0.038715 -0.043978 -0.186232   
37  0.059351 -0.042778  0.278041  ... -0.162018  0.039971 -0.072167  0.118201   
38 -0.045813 -0.214241  0.144298  ...  0.200754  0.060305 -0.052715 -0.368025   
39 -0.026885 -0.004904  0.118536  ...  0.183545  0.144384 -0.113019  0.061747   
40 -0.414448  0.102412 -0.190054  ... -0.037817  0.103095 -0.171996  0.260250   
41 -0.009521 -0.030462 -0.075652  ... -0.063382  0.013423 -0.116174 -0.031475   
42 -0.196923 -0.087439  0.160789  ... -0.074668  0.005044  0.047029  0.188417   
43 -0.194654 -0.051763 -0.040742  ... -0.035144  0.094323  0.087288 -0.118622   
44  0.247694 -0.146731 -0.203355  ... -0.126526 -0.097950 -0.049801  0.175830   
45 -0.181327  0.012670 -0.033894  ...  0.007198  0.054593 -0.069217 -0.038636   
46 -0.068321  0.179454 -0.073943  ... -0.029118 -0.058548 -0.081229 -0.074020   
47 -0.039079 -0.148131 -0.137673  ... -0.124314  0.059853 -0.270345  0.044812   
48 -0.275193  0.037574  0.091101  ...  0.095810 -0.075606  0.297845  0.107232   
49  0.340168 -0.012469  0.073329  ... -0.023999 -0.015899  0.095437 -0.057028   
50  0.075673 -0.075572  0.100334  ...  0.067682  0.014641 -0.009953 -0.050865   
51  0.006275  0.472322 -0.003302  ...  0.055476 -0.019919  0.045956 -0.040336   
52  0.101918  0.119728  0.100622  ...  0.028013 -0.027735 -0.187413  0.076840   
53  0.165297  0.565882 -0.013550  ...  0.048270  0.070423  0.106546 -0.022905   
54 -0.066018 -0.044682 -0.028647  ...  0.014358  0.033076 -0.116004  0.009319   
55  0.101909 -0.196163 -0.070056  ... -0.090041 -0.042800  0.431337  0.084701   
56 -0.068488 -0.170296  0.068620  ... -0.064024 -0.002967  0.097151 -0.018809   
57  0.080839  0.028037 -0.050006  ...  0.049323 -0.029921 -0.123425  0.035497   
58  0.033552 -0.055824  0.028844  ...  0.025924 -0.024820  0.514393 -0.041002   
59 -0.005886  0.107141  0.025298  ... -0.009675 -0.012164  0.023458 -0.011096   

       VAKBN     VESTL     YATAS     YKBNK     YUNSA     ZOREN  
0  -0.097293 -0.157651 -0.168918  0.037678 -0.087511 -0.131610  
1  -0.249343  0.035116 -0.017233 -0.279360 -0.000860 -0.097437  
2   0.022534  0.082714 -0.071226 -0.055424 -0.267015  0.069573  
3  -0.075411  0.135683 -0.045575  0.010490  0.188490  0.207352  
4  -0.102075 -0.027652  0.004609 -0.123473  0.161817  0.143059  
5   0.020483  0.136444 -0.026446  0.099011 -0.023845  0.112381  
6   0.105345 -0.001731 -0.106294  0.066752  0.058542  0.065990  
7  -0.040193 -0.088092  0.114997 -0.123584 -0.105593  0.059178  
8  -0.037669 -0.195789 -0.140023 -0.068553 -0.052067 -0.184885  
9   0.002835  0.246614  0.020698  0.151362 -0.094010  0.243209  
10 -0.040833  0.073624  0.050855  0.156392 -0.009638 -0.007291  
11  0.041417 -0.052017  0.258278 -0.087463 -0.338253  0.012136  
12  0.036069  0.114764  0.046831 -0.029281  0.177640 -0.072637  
13 -0.018914 -0.133763 -0.030164 -0.080943 -0.048286 -0.304335  
14 -0.002722  0.077823  0.014745  0.090558  0.059591 -0.042285  
15 -0.104422  0.033216 -0.135501 -0.037410  0.062810 -0.161058  
16  0.078515  0.221362  0.103737  0.035183  0.024489 -0.155844  
17 -0.067043 -0.030218  0.016484 -0.162123  0.024618  0.292681  
18  0.004022 -0.171897  0.194950  0.056921  0.397113 -0.296935  
19  0.101843 -0.303697  0.079532 -0.118529 -0.103109 -0.059768  
20  0.020304 -0.037174 -0.209290 -0.156873 -0.203302  0.260174  
21 -0.084727  0.086178 -0.015370 -0.004374 -0.212708 -0.085157  
22  0.052818 -0.008955  0.066427 -0.200032 -0.018244 -0.214457  
23  0.061629 -0.034048  0.107109  0.113638  0.195566 -0.023223  
24 -0.120516 -0.061378  0.139224 -0.147479  0.378110  0.239333  
25 -0.142125 -0.036263  0.095360 -0.068822 -0.112429 -0.138734  
26 -0.008243 -0.190749  0.181972 -0.045117 -0.074120 -0.020122  
27 -0.007852  0.110178 -0.315156 -0.050448  0.199474 -0.025504  
28 -0.078951 -0.182567 -0.044970  0.188885 -0.101274  0.231323  
29  0.014295 -0.065630  0.086595 -0.243597  0.084445  0.102730  
30  0.061431 -0.182519 -0.235187  0.140675  0.162455  0.116428  
31  0.091447  0.099113 -0.195808 -0.116719 -0.036282 -0.201761  
32  0.163081 -0.090998  0.157363 -0.149370  0.179985  0.171750  
33  0.081526  0.287324  0.386418  0.174639 -0.033254 -0.106233  
34 -0.094853  0.143703 -0.117339 -0.143349  0.025409 -0.254484  
35  0.127179 -0.005158  0.176035 -0.134187  0.017659  0.045184  
36 -0.037365  0.149862  0.086180 -0.112522 -0.238988  0.080567  
37  0.078318 -0.236720  0.358247  0.083786 -0.044651  0.037078  
38  0.225834 -0.216257 -0.092689 -0.039684 -0.058540  0.024330  
39  0.026749  0.142367  0.036135  0.013533 -0.046625 -0.039870  
40  0.190792 -0.018509 -0.024903  0.157025 -0.095567 -0.036821  
41  0.160414  0.349410  0.090798 -0.205143  0.079729  0.044719  
42  0.034564 -0.031061  0.056238 -0.275459 -0.036847  0.019721  
43  0.066393 -0.139757 -0.112957 -0.066630 -0.041057  0.102084  
44 -0.023963  0.020909  0.004391 -0.125744 -0.053889  0.056183  
45  0.291364  0.113996 -0.140177 -0.172513  0.083299  0.003329  
46  0.156611 -0.024379 -0.049944  0.194328 -0.042631 -0.010570  
47 -0.229786  0.053449 -0.075186  0.128416 -0.043417  0.014165  
48 -0.265252  0.102687  0.076005 -0.103218  0.000829  0.038499  
49 -0.035370  0.045991  0.054114  0.272486 -0.011652  0.003103  
50 -0.210054 -0.070880  0.062943  0.178426  0.003477 -0.022193  
51  0.287339  0.002066 -0.061784  0.060832 -0.032127 -0.025123  
52 -0.099243  0.099843  0.021899 -0.099763 -0.031748 -0.013786  
53 -0.039708  0.066412 -0.014855 -0.084461 -0.026296  0.015411  
54  0.263428  0.049992 -0.028897 -0.048390 -0.014019 -0.041728  
55  0.346039  0.036969 -0.052743 -0.003505 -0.009637  0.002799  
56  0.125464  0.051240  0.040458 -0.094102 -0.000452 -0.032783  
57  0.154996 -0.001366 -0.011249  0.003733  0.032426 -0.012475  
58 -0.066220  0.063381 -0.092379  0.058802  0.024252 -0.039279  
59 -0.037163  0.021461 -0.013667 -0.045754 -0.003362 -0.057624  

[60 rows x 60 columns]
---------------------------------------------------------------------------
PermissionError                           Traceback (most recent call last)
Cell In[53], line 48
     45 print("\nLoadings Table:")
     46 print(loadings)
---> 48 component_stats.to_excel('component_stats.xlsx',index=False)
     49 loadings.to_excel('loadings.xlsx',index=False)

File ~\anaconda3\lib\site-packages\pandas\util\_decorators.py:211, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs)
    209     else:
    210         kwargs[new_arg_name] = new_arg_value
--> 211 return func(*args, **kwargs)

File ~\anaconda3\lib\site-packages\pandas\util\_decorators.py:211, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs)
    209     else:
    210         kwargs[new_arg_name] = new_arg_value
--> 211 return func(*args, **kwargs)

File ~\anaconda3\lib\site-packages\pandas\core\generic.py:2374, in NDFrame.to_excel(self, excel_writer, sheet_name, na_rep, float_format, columns, header, index, index_label, startrow, startcol, engine, merge_cells, encoding, inf_rep, verbose, freeze_panes, storage_options)
   2361 from pandas.io.formats.excel import ExcelFormatter
   2363 formatter = ExcelFormatter(
   2364     df,
   2365     na_rep=na_rep,
   (...)
   2372     inf_rep=inf_rep,
   2373 )
-> 2374 formatter.write(
   2375     excel_writer,
   2376     sheet_name=sheet_name,
   2377     startrow=startrow,
   2378     startcol=startcol,
   2379     freeze_panes=freeze_panes,
   2380     engine=engine,
   2381     storage_options=storage_options,
   2382 )

File ~\anaconda3\lib\site-packages\pandas\io\formats\excel.py:944, in ExcelFormatter.write(self, writer, sheet_name, startrow, startcol, freeze_panes, engine, storage_options)
    940     need_save = False
    941 else:
    942     # error: Cannot instantiate abstract class 'ExcelWriter' with abstract
    943     # attributes 'engine', 'save', 'supported_extensions' and 'write_cells'
--> 944     writer = ExcelWriter(  # type: ignore[abstract]
    945         writer, engine=engine, storage_options=storage_options
    946     )
    947     need_save = True
    949 try:

File ~\anaconda3\lib\site-packages\pandas\io\excel\_openpyxl.py:60, in OpenpyxlWriter.__init__(self, path, engine, date_format, datetime_format, mode, storage_options, if_sheet_exists, engine_kwargs, **kwargs)
     56 from openpyxl.workbook import Workbook
     58 engine_kwargs = combine_kwargs(engine_kwargs, kwargs)
---> 60 super().__init__(
     61     path,
     62     mode=mode,
     63     storage_options=storage_options,
     64     if_sheet_exists=if_sheet_exists,
     65     engine_kwargs=engine_kwargs,
     66 )
     68 # ExcelWriter replaced "a" by "r+" to allow us to first read the excel file from
     69 # the file and later write to it
     70 if "r+" in self._mode:  # Load from existing workbook

File ~\anaconda3\lib\site-packages\pandas\io\excel\_base.py:1313, in ExcelWriter.__init__(self, path, engine, date_format, datetime_format, mode, storage_options, if_sheet_exists, engine_kwargs, **kwargs)
   1309 self._handles = IOHandles(
   1310     cast(IO[bytes], path), compression={"compression": None}
   1311 )
   1312 if not isinstance(path, ExcelWriter):
-> 1313     self._handles = get_handle(
   1314         path, mode, storage_options=storage_options, is_text=False
   1315     )
   1316 self._cur_sheet = None
   1318 if date_format is None:

File ~\anaconda3\lib\site-packages\pandas\io\common.py:865, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    856         handle = open(
    857             handle,
    858             ioargs.mode,
   (...)
    861             newline="",
    862         )
    863     else:
    864         # Binary mode
--> 865         handle = open(handle, ioargs.mode)
    866     handles.append(handle)
    868 # Convert BytesIO or file objects passed with an encoding

PermissionError: [Errno 13] Permission denied: 'component_stats.xlsx'
In [43]:
##to see the companies that behave in the most similar way


# 'n_components' is the total number of components

# Create a dictionary to store the counts of same-sign numbers for pairs of companies
similar_sign_counts = {}

for component_number in range(n_components):
    # Get the loadings for the current component
    component_loadings = loadings.iloc[component_number, :]
    
    # Identify companies with positive and negative correlations
    positive_correlations = component_loadings[component_loadings > 0].index
    negative_correlations = component_loadings[component_loadings < 0].index
    
    # Update the dictionary with positive correlations
    for i, company1 in enumerate(positive_correlations):
        for j, company2 in enumerate(positive_correlations):
            if i != j:
                pair = (company1, company2)
                if pair not in similar_sign_counts:
                    similar_sign_counts[pair] = 0
                similar_sign_counts[pair] += 1
    
    # Update the dictionary with negative correlations
    for i, company1 in enumerate(negative_correlations):
        for j, company2 in enumerate(negative_correlations):
            if i != j:
                pair = (company1, company2)
                if pair not in similar_sign_counts:
                    similar_sign_counts[pair] = 0
                similar_sign_counts[pair] += 1

# Find the pairs of companies with the most common same-sign numbers
sorted_pairs = sorted(similar_sign_counts.items(), key=lambda x: x[1], reverse=True)

# Print the first 30 odd-numbered pairs and their common same-sign count  #since it lists the same pair twice
print("First 30 odd-numbered pairs of companies with the most common same-sign numbers:")
for i, (pair, count) in enumerate(sorted_pairs[:60], start=1):
    if i % 2 != 0:
        print(f"Pair: {pair}, Common Same-Sign Count: {count}")
First 30 odd-numbered pairs of companies with the most common same-sign numbers:
Pair: ('EREGL', 'KRDMD'), Common Same-Sign Count: 47
Pair: ('ANACM', 'SISE'), Common Same-Sign Count: 41
Pair: ('KCHOL', 'TUPRS'), Common Same-Sign Count: 41
Pair: ('TKFEN', 'ASELS'), Common Same-Sign Count: 41
Pair: ('BAGFS', 'TTKOM'), Common Same-Sign Count: 40
Pair: ('AYGAZ', 'USAK'), Common Same-Sign Count: 40
Pair: ('KCHOL', 'SODA'), Common Same-Sign Count: 40
Pair: ('SAHOL', 'OTKAR'), Common Same-Sign Count: 40
Pair: ('SODA', 'KCHOL'), Common Same-Sign Count: 40
Pair: ('SODA', 'TTKOM'), Common Same-Sign Count: 40
Pair: ('ANACM', 'SODA'), Common Same-Sign Count: 39
Pair: ('AYGAZ', 'ECILC'), Common Same-Sign Count: 39
Pair: ('ECILC', 'AYGAZ'), Common Same-Sign Count: 39
Pair: ('GOODY', 'ISCTR'), Common Same-Sign Count: 39
Pair: ('KCHOL', 'FROTO'), Common Same-Sign Count: 39
Pair: ('SODA', 'ANACM'), Common Same-Sign Count: 39
Pair: ('AKBNK', 'GARAN'), Common Same-Sign Count: 38
Pair: ('ALBRK', 'BANVT'), Common Same-Sign Count: 38
Pair: ('ANACM', 'TUPRS'), Common Same-Sign Count: 38
Pair: ('CEMAS', 'SAHOL'), Common Same-Sign Count: 38
Pair: ('GARAN', 'ISDMR'), Common Same-Sign Count: 38
Pair: ('ISCTR', 'VAKBN'), Common Same-Sign Count: 38
Pair: ('KAREL', 'PRKME'), Common Same-Sign Count: 38
Pair: ('PRKME', 'KAREL'), Common Same-Sign Count: 38
Pair: ('SASA', 'THYAO'), Common Same-Sign Count: 38
Pair: ('THYAO', 'PETKM'), Common Same-Sign Count: 38
Pair: ('THYAO', 'TUKAS'), Common Same-Sign Count: 38
Pair: ('TRKCM', 'TKFEN'), Common Same-Sign Count: 38
Pair: ('TUKAS', 'THYAO'), Common Same-Sign Count: 38
Pair: ('VAKBN', 'ISCTR'), Common Same-Sign Count: 38
In [56]:
#### from the loadings table, it can be seen that 14th component covers 95% of the variance. 
#I made the PCA analysis one more time with 14 components.

    
    ####### PCA ANALYSIS


from sklearn.decomposition import PCA
import pandas as pd
import numpy as np

# Assuming your DataFrame is named "allticks_data" with the stock prices

# Drop the timestamp column
stock_data = allticks_data.drop(columns=['timestamp'])

# Create an imputer to fill missing values with the column mean
imputer = SimpleImputer(strategy='mean')
stock_data_imputed = imputer.fit_transform(stock_data)

# Standardize the data
scaler = StandardScaler()
stock_data_standardized = scaler.fit_transform(stock_data_imputed)

# Initialize and fit the PCA model
n_components = 14  # Number of principal components to retain
pca = PCA(n_components=n_components)
pca.fit(stock_data_standardized)

# Get the explained variance ratio
explained_variance_ratio = pca.explained_variance_ratio_

# Create a table for component statistics
component_stats = pd.DataFrame({
    'Component': range(1, len(explained_variance_ratio) + 1),
    'Standard Deviation': np.sqrt(pca.explained_variance_),
    'Proportion of Variance': pca.explained_variance_ratio_,
    'Cumulative Proportion': np.cumsum(pca.explained_variance_ratio_)
})

# Display the component statistics table
print("Component Statistics:")
print(component_stats)

# Create a table for loadings
loadings = pd.DataFrame(pca.components_, columns=stock_data.columns)

# Display the loadings table
print("\nLoadings Table:")
print(loadings)

component_stats.to_excel('component_stats2.xlsx',index=False)
loadings.to_excel('loadings2.xlsx',index=False)

    
Component Statistics:
    Component  Standard Deviation  Proportion of Variance  \
0           1            5.411674                0.488094   
1           2            3.196438                0.170283   
2           3            2.536973                0.107268   
3           4            1.730182                0.049891   
4           5            1.511590                0.038081   
5           6            1.133295                0.021406   
6           7            0.958872                0.015324   
7           8            0.892638                0.013280   
8           9            0.783668                0.010235   
9          10            0.729850                0.008878   
10         11            0.707544                0.008343   
11         12            0.653014                0.007107   
12         13            0.611126                0.006224   
13         14            0.597243                0.005945   

    Cumulative Proportion  
0                0.488094  
1                0.658377  
2                0.765646  
3                0.815537  
4                0.853618  
5                0.875023  
6                0.890347  
7                0.903627  
8                0.913862  
9                0.922740  
10               0.931083  
11               0.938190  
12               0.944415  
13               0.950360  

Loadings Table:
       AEFES     AKBNK      AKSA     AKSEN     ALARK     ALBRK     ANACM  \
0  -0.010619 -0.115330 -0.162991 -0.074086 -0.139545 -0.008576 -0.168831   
1  -0.164429 -0.212004 -0.033005 -0.159788 -0.081720 -0.112583  0.058363   
2  -0.261382  0.073484  0.068234 -0.147875 -0.106180 -0.203374 -0.094810   
3   0.014416 -0.094138  0.076323 -0.196707 -0.158188  0.248636 -0.086114   
4   0.015394 -0.105984  0.066446  0.236019 -0.223156 -0.122610 -0.011328   
5  -0.018139  0.085660 -0.180828  0.241495 -0.126733  0.140710  0.105549   
6   0.075172  0.108106 -0.199154  0.119552 -0.150211  0.406539  0.056820   
7   0.059575 -0.046843 -0.060446 -0.042836 -0.004831  0.053880 -0.000583   
8  -0.114284 -0.051815 -0.063431 -0.068982  0.026342  0.118115 -0.062175   
9  -0.192200  0.000007 -0.083598 -0.118614  0.117635  0.089625 -0.042355   
10  0.455559  0.108448 -0.128295 -0.144293 -0.036526 -0.476646 -0.049103   
11  0.273705  0.003007  0.064817 -0.091986 -0.016509  0.160398 -0.059566   
12  0.101925 -0.147138  0.080896  0.007856 -0.040817 -0.092892  0.010144   
13  0.364150 -0.059971  0.170873  0.019021 -0.014857  0.263476  0.050102   

       ARCLK     ASELS     ASUZU  ...     TTKOM     TUKAS     TUPRS      USAK  \
0  -0.124655 -0.176649 -0.152185  ...  0.011291 -0.115707 -0.169414 -0.130355   
1  -0.032705  0.003778 -0.088927  ... -0.268609  0.065278  0.096402 -0.142497   
2   0.251670  0.005959 -0.127121  ...  0.110387  0.057695 -0.011123  0.079051   
3  -0.038360 -0.090251  0.095978  ...  0.114039  0.104382 -0.033534 -0.177370   
4  -0.123666  0.063168  0.098447  ... -0.066522 -0.320755 -0.067071  0.091815   
5  -0.161081 -0.056907 -0.020431  ...  0.010542  0.263910  0.089874 -0.001966   
6   0.067813 -0.024457 -0.065777  ... -0.114462 -0.278941  0.020598 -0.153688   
7  -0.104347  0.113289  0.051419  ... -0.082354  0.031025  0.046581  0.016583   
8   0.006667  0.119897 -0.160641  ... -0.133479  0.031025  0.147930  0.228955   
9   0.023116 -0.034316 -0.008794  ... -0.074870  0.012061 -0.061723 -0.067156   
10 -0.055182 -0.030732  0.008691  ... -0.088477 -0.140426 -0.034041 -0.196680   
11  0.066105  0.069923  0.084401  ...  0.068581 -0.027344 -0.008731 -0.088901   
12  0.048877 -0.019558 -0.022105  ...  0.030389 -0.064620  0.000541  0.143114   
13  0.103948 -0.057326 -0.085095  ...  0.020563  0.174027  0.031630 -0.059159   

       VAKBN     VESTL     YATAS     YKBNK     YUNSA     ZOREN  
0  -0.097293 -0.157651 -0.168918  0.037678 -0.087511 -0.131610  
1  -0.249343  0.035116 -0.017233 -0.279360 -0.000860 -0.097437  
2   0.022534  0.082714 -0.071226 -0.055424 -0.267015  0.069573  
3  -0.075411  0.135683 -0.045575  0.010490  0.188490  0.207352  
4  -0.102075 -0.027652  0.004609 -0.123473  0.161818  0.143059  
5   0.020483  0.136444 -0.026447  0.099011 -0.023847  0.112380  
6   0.105346 -0.001733 -0.106291  0.066751  0.058548  0.065992  
7  -0.040198 -0.088100  0.115011 -0.123578 -0.105553  0.059185  
8  -0.037660 -0.195783 -0.140053 -0.068580 -0.052145 -0.184886  
9   0.002821  0.246574  0.020774  0.151392 -0.093817  0.243242  
10 -0.040823  0.073622  0.050823  0.156377 -0.009686 -0.007276  
11  0.041462 -0.051917  0.258117 -0.087461 -0.338558  0.012085  
12  0.035977  0.114288  0.047358 -0.029182  0.178952 -0.072451  
13 -0.018846 -0.133331 -0.031088 -0.081587 -0.050757 -0.304658  

[14 rows x 60 columns]

PCA Analysis

I started the analysis with selecting the number of components as the original column dimension, 60 (after dropping the timestamp column). I observed that component 14 covers the 95% of the variance. So, by this PCA analysis, 14 components may be enough to represent the data. Even with a higher threshold of 99%, 33 components are enough. Thus, it can be said that the dimension can be reduced to half or even quarter of the original with PCA.

In PCA two features having the same-sign contribution to the components means that these companies behave similarly, such that they get affected by the changes (in market in this case) similarly. This can indicate that such companies have a similar working field. Thus, the companies that have the most common same-sign in component loadings are examined pairwise. The first pair is EREGL (EREĞLİ DEMİR VE ÇELİK FABRİKALARI )and KRDMD ( KARDEMİR KARABÜK DEMİR ÇELİK SANAYİ VE TİCARET A.Ş.) with the 47 common same signs. Their working field are exactly the same, so, it is a predictable and meaningful result. Next, ANACM( Anadolu Cam Sanayii A.Ş) and SISE (Turkiye Sise ve Cam Fabrikalari AS) with the count of 41. Again, they have the same industry. Another pair with count of 41 is KCHOL and TUPRS, TÜPRAŞ is a member of the companies of Koç Holding, so they are integrated. Another pair with count of 41 are TKFEN and ASELS. TKFEN produces chemical fertilizers, ASELSAN produces defense electronics, but they are both industrial companies that works with mass productions. They can be affected similarly from industry related investments. AKBNK (AKBANK) and GARAN (GARANTI BBVA) also have a high number of same-sign count of 38 since they are both banks. There are many pairs that seems unrelated, these pairs may have high numbers of count only because of the changes of the overall economy of the country and the world.

APPENDIX: I used ChatGPT for help with writing the codes.

Question-I have a table called allticks_data, this table has columns 61 columns; first column is timestaps, and the rest is 60 companies's stock price values at this time. I want to plot the boxplot of each companies stock prices in Phyton. can you help me?

To create box plots for the stock prices of each of the 60 companies in your allticks_data DataFrame in Python, you can use the seaborn library in conjunction with matplotlib. Here's a step-by-step guide to help you accomplish this task:

Assuming that you have a DataFrame called allticks_data, where the first column is "timestamps" and the remaining columns represent the stock prices of different companies, you can create box plots as follows:

  1. First, make sure you have the necessary libraries installed. You should have seaborn and matplotlib installed. If not, you can install them with:
pip install seaborn matplotlib
  1. Import the required libraries and set the style:
import seaborn as sns
import matplotlib.pyplot as plt
  1. Create the box plots for each company's stock prices:
# Assuming your DataFrame is named "allticks_data"

# Select all the columns containing stock prices (excluding the "timestamps" column)
stock_columns = allticks_data.columns[1:]  # Exclude the first column (timestamps)

# Create a box plot for each stock price column
for company in stock_columns:
    sns.set(style="whitegrid")  # Set the style of the plot (customize as needed)
    sns.boxplot(x=allticks_data[company])
    plt.xlabel("Price")
    plt.title(f"Box Plot of {company} Stock Price")
    plt.show()

This code will generate individual box plots for each of the 60 companies' stock prices in your DataFrame. You can customize the style and appearance of the plots to your preferences.

Make sure to adjust the code accordingly if your DataFrame or column names are different from what's mentioned in the code.

how do i calculate the variance and standard deviation of each column (except the first column, i.e. timestamp) ChatGPT To calculate the variance and standard deviation for each column (excluding the first column, i.e., the "timestamps" column) in a DataFrame, you can use the pandas library. Here's how you can do it: python Copy code import pandas as pd

Assuming your DataFrame is named "allticks_data"¶

Select all columns except the first column (timestamps)¶

columns_to_analyze = allticks_data.columns[1:]

Calculate the variance and standard deviation for each selected column¶

variances = allticks_data[columns_to_analyze].var() std_deviations = allticks_data[columns_to_analyze].std()

Create a new DataFrame to store the results¶

statistics_df = pd.DataFrame({ 'Column': columns_to_analyze, 'Variance': variances, 'Standard Deviation': std_deviations })

Display the calculated statistics¶

print(statistics_df)

This code will create a new DataFrame (statistics_df) that contains the column names, their variances, and standard deviations. You can modify the code to save the results to a file or perform further analysis as needed.

I want to plot the first 30 rows of columns named YATAS and BANVT in the same figure; i want seperate graphs for all the following 30 rows until the end of data ChatGPT To plot the first 30 rows of columns "YATAS" and "BANVT" in the same figure and then create separate graphs for each subsequent set of 30 rows until the end of the data, you can use the following code: python Copy code import matplotlib.pyplot as plt

Assuming your DataFrame is named "allticks_data"¶

Select the timestamps and the specified columns¶

timestamps = allticks_data['timestamps'] selected_columns = ["YATAS", "BANVT"]

Calculate the number of sets of 30 rows¶

total_rows = len(allticks_data) num_sets = total_rows // 30

Set the figure size¶

plt.figure(figsize=(12, 6))

Plot the first 30 rows for the specified columns in the same graph¶

for column in selected_columns: column_data = allticks_data[column][:30] plt.plot(timestamps[:30], column_data, label=column)

plt.xticks(rotation=45) # Rotate x-axis labels for readability

Set labels and title for the first graph¶

plt.xlabel("Timestamp") plt.ylabel("Price") plt.title("Stock Prices for the First 30 Rows")

Add a legend to distinguish between columns¶

plt.legend(loc="upper right")

Show the first graph¶

plt.show()

Create and display separate graphs for each subsequent set of 30 rows¶

for i in range(1, num_sets): plt.figure(figsize=(12, 6)) for column in selected_columns: start_row = i 30 end_row = (i + 1) 30 column_data = allticks_data[column][start_row:end_row] plt.plot(timestamps[start_row:end_row], column_data, label=column)

plt.xticks(rotation=45)  # Rotate x-axis labels for readability

# Set labels and title for subsequent graphs
plt.xlabel("Timestamp")
plt.ylabel("Price")
plt.title(f"Stock Prices for Rows {start_row+1}-{end_row}")

# Add a legend to distinguish between columns
plt.legend(loc="upper right")

# Show the subsequent graphs
plt.show()


This code will first plot the first 30 rows of "YATAS" and "BANVT" in the same graph and then create separate graphs for each subsequent set of 30 rows until the end of the data, with each set of rows having its own graph. Adjust the figure size, labels, and other visual settings as needed.

i want seperate graphs for each 30 rows until the end of data of columns named YATAS and BANVT in the same figure; ChatGPT If you want separate graphs for each set of 30 rows for columns "YATAS" and "BANVT," all in the same figure, you can modify the code as follows: python Copy code import matplotlib.pyplot as plt

Assuming your DataFrame is named "allticks_data"¶

Select the timestamps and the specified columns¶

timestamps = allticks_data['timestamps'] selected_columns = ["YATAS", "BANVT"]

Calculate the number of sets of 30 rows¶

total_rows = len(allticks_data) num_sets = total_rows // 30

Set the figure size¶

plt.figure(figsize=(12, 6))

Create and display separate graphs for each set of 30 rows for the specified columns¶

for i in range(num_sets): plt.figure(figsize=(12, 6)) for column in selected_columns: start_row = i 30 end_row = (i + 1) 30 column_data = allticks_data[column][start_row:end_row] plt.plot(timestamps[start_row:end_row], column_data, label=column)

plt.xticks(rotation=45)  # Rotate x-axis labels for readability

# Set labels and title for each graph
plt.xlabel("Timestamp")
plt.ylabel("Price")
plt.title(f"Stock Prices for Rows {start_row+1}-{end_row}")

# Add a legend to distinguish between columns
plt.legend(loc="upper right")

# Show each graph
plt.show()


This code will create separate graphs for each set of 30 rows for columns "YATAS" and "BANVT" and display them all in the same figure. Each set of rows will have its own graph with separate x- and y-axis labels.

i want to see the correlations of those two columns for 30 row's period ChatGPT To calculate the correlation between two columns ("YATAS" and "BANVT") for a 30-row period in your DataFrame, you can use the pandas library. You can extract the data for the specific period and then calculate the correlation between the two columns. Here's an example of how to do it: python Copy code

Assuming your DataFrame is named "allticks_data"¶

Select the timestamps and the specified columns¶

timestamps = allticks_data['timestamps'] selected_columns = ["YATAS", "BANVT"]

Calculate the number of sets of 30 rows¶

total_rows = len(allticks_data) num_sets = total_rows // 30

Initialize lists to store the correlations¶

correlations = []

Calculate the correlation for each set of 30 rows¶

for i in range(num_sets): start_row = i 30 end_row = (i + 1) 30 period_data = allticks_data[selected_columns][start_row:end_row] correlation = period_data.corr().iloc[0, 1] # Calculate the correlation between the two columns correlations.append(correlation)

Print or use the list of correlations¶

print(correlations)

In this code, we extract the data for each 30-row period, calculate the correlation between the "YATAS" and "BANVT" columns within that period using the .corr() method, and store the correlation values in the correlations list. You can choose to print, visualize, or analyze the correlation values as needed.

i also want to plot the correlation ChatGPT If you want to plot the correlations of "YATAS" and "BANVT" for each 30-row period, you can do so using a line plot. Here's how you can modify the code to achieve this: python Copy code import matplotlib.pyplot as plt

Assuming your DataFrame is named "allticks_data"¶

Select the timestamps and the specified columns¶

timestamps = allticks_data['timestamps'] selected_columns = ["YATAS", "BANVT"]

Calculate the number of sets of 30 rows¶

total_rows = len(allticks_data) num_sets = total_rows // 30

Initialize lists to store the correlations and the x-axis labels¶

correlations = [] x_labels = []

Calculate the correlation and create a plot for each set of 30 rows¶

for i in range(num_sets): start_row = i 30 end_row = (i + 1) 30 period_data = allticks_data[selected_columns][start_row:end_row] correlation = period_data.corr().iloc[0, 1] # Calculate the correlation between the two columns correlations.append(correlation) x_labels.append(f"Rows {start_row + 1}-{end_row}")

Create a line plot for the correlations¶

plt.figure(figsize=(12, 6)) plt.plot(x_labels, correlations, marker='o', linestyle='-', color='b')

Set labels and title¶

plt.xlabel("30-Row Periods") plt.ylabel("Correlation") plt.title("Correlation Between YATAS and BANVT for 30-Row Periods")

Rotate x-axis labels for readability¶

plt.xticks(rotation=45)

Show the plot¶

plt.show()

This code will create a line plot that shows the correlations between "YATAS" and "BANVT" for each 30-row period, and the x-axis labels represent the corresponding row periods. You can adjust the figure size, labels, and other visual settings as needed.

i want to have plots for each 30-row period ChatGPT If you want to create separate plots for each 30-row period and visualize the correlation between "YATAS" and "BANVT" for each period, you can modify the code as follows: python Copy code import matplotlib.pyplot as plt

Assuming your DataFrame is named "allticks_data"¶

Select the timestamps and the specified columns¶

timestamps = allticks_data['timestamps'] selected_columns = ["YATAS", "BANVT"]

Calculate the number of sets of 30 rows¶

total_rows = len(allticks_data) num_sets = total_rows // 30

Initialize lists to store the correlations and the x-axis labels¶

correlations = [] x_labels = []

Create and display separate plots for each set of 30 rows¶

for i in range(num_sets): start_row = i 30 end_row = (i + 1) 30 period_data = allticks_data[selected_columns][start_row:end_row] correlation = period_data.corr().iloc[0, 1] # Calculate the correlation between the two columns correlations.append(correlation) x_labels.append(f"Rows {start_row + 1}-{end_row}")

# Create a line plot for the correlation
plt.figure(figsize=(12, 6))
plt.plot(timestamps[start_row:end_row], period_data[selected_columns])

# Set labels and title for each plot
plt.xlabel("Timestamp")
plt.ylabel("Price")
plt.title(f"Stock Prices for Rows {start_row + 1}-{end_row}")

# Add a legend to distinguish between columns
plt.legend(selected_columns, loc="upper right")

# Show each plot
plt.show()


This code will create separate plots for each 30-row period, displaying the stock prices of "YATAS" and "BANVT" for each period, and calculating and displaying the correlation values in the respective plots. Each plot will have its own x-axis labels and titles. Adjust the figure size, labels, and other visual settings as needed.

can i calculate the correlation between those two columns for the first 30 rows ChatGPT Certainly, you can calculate the correlation between the "YATAS" and "BANVT" columns for the first 30 rows. Here's how you can do it: python Copy code

Assuming your DataFrame is named "allticks_data"¶

Select the timestamps and the specified columns¶

timestamps = allticks_data['timestamps'] selected_columns = ["YATAS", "BANVT"]

Select the first 30 rows for the specified columns¶

period_data = allticks_data[selected_columns][:30]

Calculate the correlation between the two columns¶

correlation = period_data.corr().iloc[0, 1]

Print or use the correlation value¶

print(f"Correlation between YATAS and BANVT for the first 30 rows: {correlation}")

This code will extract the data for the first 30 rows of "YATAS" and "BANVT," calculate the correlation between these two columns, and print the correlation value. You can use or analyze the correlation value as needed.

i want to see the correlations of those two columns for 30 row's period, i want a list of all correlations values but i want to see the start row and end row of each one in the list too. ChatGPT If you want to calculate the correlations between the "YATAS" and "BANVT" columns for each 30-row period and have a list that includes both the correlation values and the corresponding start and end row for each period, you can do so using the following code: python Copy code

Assuming your DataFrame is named "allticks_data"¶

Select the timestamps and the specified columns¶

timestamps = allticks_data['timestamps'] selected_columns = ["YATAS", "BANVT"]

Calculate the number of sets of 30 rows¶

total_rows = len(allticks_data) num_sets = total_rows // 30

Initialize a list to store correlation values and period information¶

correlations_with_periods = []

Calculate the correlation for each set of 30 rows¶

for i in range(num_sets): start_row = i 30 end_row = (i + 1) 30 period_data = allticks_data[selected_columns][start_row:end_row] correlation = period_data.corr().iloc[0, 1] # Calculate the correlation between the two columns correlations_with_periods.append((start_row, end_row, correlation))

Display the list of correlations with period information¶

for start_row, end_row, correlation in correlations_with_periods: print(f"Correlation between YATAS and BANVT for Rows {start_row + 1}-{end_row}: {correlation}")

This code will calculate the correlations for each 30-row period and create a list correlations_with_periods that contains tuples of start row, end row, and correlation value for each period. It then displays the list with the corresponding information. You can further use or analyze this list as needed.

instead of start and end row numbers, i want to see the timestamp in the timestamp column that are in those rows in the list. ChatGPT If you want to see the timestamps from the "timestamps" column for the corresponding start and end rows within each 30-row period in the list, you can modify the code as follows: python Copy code

Assuming your DataFrame is named "allticks_data"¶

Select the timestamps and the specified columns¶

timestamps = allticks_data['timestamps'] selected_columns = ["YATAS", "BANVT"]

Calculate the number of sets of 30 rows¶

total_rows = len(allticks_data) num_sets = total_rows // 30

Initialize a list to store correlation values and period information¶

correlations_with_periods = []

Calculate the correlation for each set of 30 rows¶

for i in range(num_sets): start_row = i 30 end_row = (i + 1) 30 period_data = allticks_data[selected_columns][start_row:end_row] correlation = period_data.corr().iloc[0, 1] # Calculate the correlation between the two columns start_timestamp = timestamps.iloc[start_row] # Get the start timestamp end_timestamp = timestamps.iloc[end_row - 1] # Get the end timestamp correlations_with_periods.append((start_timestamp, end_timestamp, correlation))

Display the list of correlations with period information¶

for start_timestamp, end_timestamp, correlation in correlations_with_periods: print(f"Correlation between YATAS and BANVT from {start_timestamp} to {end_timestamp}: {correlation}")

In this modified code, the start_timestamp and end_timestamp are extracted from the "timestamps" column for the corresponding rows in each 30-row period, and these timestamps are included in the list along with the correlation values. This provides you with the start and end timestamps for each period in the list.

i want to see the values in the column named "timestamp" ChatGPT If you want to see the values in the "timestamps" column, you can extract and print these values from your DataFrame. Here's how to do it: python Copy code

Assuming your DataFrame is named "allticks_data"¶

Extract the values from the "timestamps" column¶

timestamps_values = allticks_data['timestamps'].tolist()

Print the values¶

print(timestamps_values)

This code will extract the values from the "timestamps" column and store them in a list timestamps_values, and then print the list of timestamp values.

i want to see the first 100 columns in the column named timestamp ChatGPT If you want to view the first 100 values in the "timestamps" column, you can extract and print a subset of the values. Here's how to do it: python Copy code

Assuming your DataFrame is named "allticks_data"¶

Extract the first 100 values from the "timestamps" column¶

timestamps_values = allticks_data['timestamps'].iloc[:100]

Print the first 100 values¶

print(timestamps_values)

This code will extract the first 100 values from the "timestamps" column and print them. Adjust the number 100 in iloc[:100] if you want to view a different number of values.

I want to do the same for each 750 row. instead of start and end row numbers, i want to see the timestamp in the timestamp column that are in those rows in the list. I also want to have the plots of each 750 row, name the plots with the same name that period has on the list ChatGPT To create a list of correlations and corresponding timestamps for each 750-row period, and generate separate plots for each period while naming the plots with the period's timestamp values, you can use the following code: python Copy code import matplotlib.pyplot as plt import pandas as pd

Assuming your DataFrame is named "allticks_data"¶

Select the timestamps and the specified columns¶

timestamps = allticks_data['timestamps'] selected_columns = ["YATAS", "BANVT"]

Calculate the number of sets of 750 rows¶

total_rows = len(allticks_data) num_sets = total_rows // 750

Initialize a list to store correlation values and timestamp information¶

correlations_with_timestamps = []

Calculate the correlation and create and display separate plots for each set of 750 rows¶

for i in range(num_sets): start_row = i 750 end_row = (i + 1) 750 period_data = allticks_data[selected_columns][start_row:end_row]

# Calculate the correlation between the two columns
correlation = period_data.corr().iloc[0, 1]

# Get the start and end timestamps
start_timestamp = timestamps.iloc[start_row]
end_timestamp = timestamps.iloc[end_row - 1]

# Add the correlation and timestamp information to the list
correlations_with_timestamps.append((start_timestamp, end_timestamp, correlation))

# Create a line plot for the period and save it with the period's timestamps as the name
plt.figure(figsize=(12, 6))
plt.plot(timestamps[start_row:end_row], period_data)

# Set labels and title for each plot
plt.xlabel("Timestamp")
plt.ylabel("Price")
plt.title(f"Stock Prices for {start_timestamp} to {end_timestamp}")

# Save the plot with the name based on timestamps
plt.savefig(f"{start_timestamp}_{end_timestamp}.png")
plt.close()

Display the list of correlations with timestamp information¶

for start_timestamp, end_timestamp, correlation in correlations_with_timestamps: print(f"Correlation between YATAS and BANVT from {start_timestamp} to {end_timestamp}: {correlation}")

In this code, correlations are calculated for each 750-row period, and for each period, a line plot is created and saved with a name based on the period's timestamps. The list correlations_with_timestamps contains correlation values and timestamp information for each period, which is then printed. You can adjust the figure size, labels, and other visual settings as needed.

name the figures with row numbers instead ChatGPT To name the figures with row numbers instead of timestamps, you can modify the code as follows: python Copy code import matplotlib.pyplot as plt import pandas as pd

Assuming your DataFrame is named "allticks_data"¶

Select the specified columns¶

selected_columns = ["YATAS", "BANVT"]

Calculate the number of sets of 750 rows¶

total_rows = len(allticks_data) num_sets = total_rows // 750

Initialize a list to store correlation values¶

correlations = []

Calculate the correlation and create and display separate plots for each set of 750 rows¶

for i in range(num_sets): start_row = i 750 end_row = (i + 1) 750 period_data = allticks_data[selected_columns][start_row:end_row]

# Calculate the correlation between the two columns
correlation = period_data.corr().iloc[0, 1]
correlations.append(correlation)

# Create a line plot for the period
plt.figure(figsize=(12, 6))
plt.plot(period_data)

# Set labels and title for each plot
plt.xlabel("Row Number")
plt.ylabel("Price")
plt.title(f"Stock Prices for Rows {start_row+1}-{end_row}")

# Save the plot with the name based on row numbers
plt.savefig(f"Row_{start_row+1}_{end_row}.png")
plt.close()

Display the list of correlations¶

for i, correlation in enumerate(correlations): start_row = i 750 end_row = (i + 1) 750 print(f"Correlation between YATAS and BANVT for Rows {start_row+1}-{end_row}: {correlation}")

In this modified code, the figures are named with the corresponding row numbers for each 750-row period. The list correlations contains correlation values, and the name of each figure is based on the row numbers.

i want to plot correlation matrix between these two column for each 750 rows ChatGPT To plot a correlation matrix between the "YATAS" and "BANVT" columns for each 750-row period, you can use the following code: python Copy code import matplotlib.pyplot as plt import pandas as pd

Assuming your DataFrame is named "allticks_data"¶

Select the specified columns¶

selected_columns = ["YATAS", "BANVT"]

Calculate the number of sets of 750 rows¶

total_rows = len(allticks_data) num_sets = total_rows // 750

Create a list to store correlation matrices and the corresponding row numbers¶

correlation_matrices = []

Calculate the correlation matrix and create plots for each set of 750 rows¶

for i in range(num_sets): start_row = i 750 end_row = (i + 1) 750 period_data = allticks_data[selected_columns][start_row:end_row]

# Calculate the correlation matrix between the two columns
correlation_matrix = period_data.corr()
correlation_matrices.append(correlation_matrix)

# Create a heatmap of the correlation matrix
plt.figure(figsize=(8, 6))
plt.imshow(correlation_matrix, cmap='coolwarm', interpolation='none', aspect='auto')
plt.colorbar()
plt.title(f"Correlation Matrix for Rows {start_row+1}-{end_row}")

# Set x and y-axis labels
plt.xticks(range(len(selected_columns), selected_columns))
plt.yticks(range(len(selected_columns), selected_columns))

# Show the plot
plt.show()

Display the list of correlation matrices¶

for i, matrix in enumerate(correlation_matrices): start_row = i 750 end_row = (i + 1) 750 print(f"Correlation Matrix for Rows {start_row+1}-{end_row}:\n{matrix}")

In this code, we calculate the correlation matrix for the "YATAS" and "BANVT" columns for each 750-row period and create a heatmap plot of the correlation matrix. The list correlation_matrices stores the correlation matrices for each period, and they are displayed using plt.show(). You can adjust the figure size, color map, labels, and other visual settings as needed.

i want to have a scatter plot of these two columns and use poly1d () function to fit a line between these data and to see if the correlation is positive or negative. do this for the first 750 columns ChatGPT To create a scatter plot of the "YATAS" and "BANVT" columns for the first 750 rows and fit a line to the data using poly1d() to determine if the correlation is positive or negative, you can use the following code: python Copy code import matplotlib.pyplot as plt import numpy as np from numpy.polynomial.polynomial import polyfit

Assuming your DataFrame is named "allticks_data"¶

Select the first 750 rows and the specified columns¶

selected_columns = ["YATAS", "BANVT"] period_data = allticks_data[selected_columns].iloc[:750]

Extract the data for each column¶

x = period_data["YATAS"] y = period_data["BANVT"]

Create a scatter plot¶

plt.figure(figsize=(8, 6)) plt.scatter(x, y, label="Data Points", color="blue")

Fit a line using polyfit¶

b, m = polyfit(x, y, 1) fit_line = np.poly1d([b, m])

Plot the fitted line¶

plt.plot(x, fit_line(x), color="red", label=f"Fitted Line (y = {m:.2f}x + {b:.2f})")

Set labels and title¶

plt.xlabel("YATAS") plt.ylabel("BANVT") plt.title("Scatter Plot and Fitted Line for the First 750 Rows")

Determine the correlation sign based on the slope of the fitted line¶

correlation_sign = "positive" if m > 0 else "negative"

Add a legend and display the plot¶

plt.legend(loc="upper right") plt.show()

Print the correlation sign¶

print(f"Correlation is {correlation_sign}")

This code selects the data for the first 750 rows, creates a scatter plot of "YATAS" and "BANVT," fits a line to the data using polyfit(), and determines whether the correlation is positive or negative based on the slope of the fitted line.

it gave the error "SVD did not converge in Linear Least Squares". how else i can visualize the correlation ChatGPT If you encountered the "SVD did not converge in Linear Least Squares" error while trying to fit a line using polyfit, it indicates that the linear least squares fitting method had difficulties converging, possibly due to the nature of the data. In such cases, you can still visualize the correlation between the two variables using alternative methods. One common approach is to use a scatter plot and calculate the correlation coefficient. Here's how you can do that: python Copy code import matplotlib.pyplot as plt import numpy as np

Assuming your DataFrame is named "allticks_data"¶

Select the first 750 rows and the specified columns¶

selected_columns = ["YATAS", "BANVT"] period_data = allticks_data[selected_columns].iloc[:750]

Extract the data for each column¶

x = period_data["YATAS"] y = period_data["BANVT"]

Create a scatter plot¶

plt.figure(figsize=(8, 6)) plt.scatter(x, y, label="Data Points", color="blue")

Calculate the correlation coefficient¶

correlation_coefficient = np.corrcoef(x, y)[0, 1]

Set labels and title¶

plt.xlabel("YATAS") plt.ylabel("BANVT") plt.title("Scatter Plot for the First 750 Rows")

Add text to display the correlation coefficient¶

plt.text(0.2, 0.9, f"Correlation: {correlation_coefficient:.2f}", transform=plt.gca().transAxes, fontsize=12)

Display the plot¶

plt.legend(loc="upper right") plt.show()

Determine the correlation sign based on the correlation coefficient¶

correlation_sign = "positive" if correlation_coefficient > 0 else "negative"

Print the correlation sign¶

print(f"Correlation is {correlation_sign}")

This code creates a scatter plot and calculates the correlation coefficient using np.corrcoef(). It also displays the correlation coefficient value on the plot. The sign of the correlation can be determined based on the coefficient's sign, just as before. This approach provides a visual representation of the correlation without the need for fitting a line.

can you also print the correlation ChatGPT Certainly! I've updated the code to print the correlation coefficient along with the visualization: python Copy code import matplotlib.pyplot as plt import numpy as np

Assuming your DataFrame is named "allticks_data"¶

Select the first 750 rows and the specified columns¶

selected_columns = ["YATAS", "BANVT"] period_data = allticks_data[selected_columns].iloc[:750]

Extract the data for each column¶

x = period_data["YATAS"] y = period_data["BANVT"]

Calculate the correlation coefficient¶

correlation_coefficient = np.corrcoef(x, y)[0, 1]

Create a scatter plot¶

plt.figure(figsize=(8, 6)) plt.scatter(x, y, label="Data Points", color="blue")

Set labels and title¶

plt.xlabel("YATAS") plt.ylabel("BANVT") plt.title("Scatter Plot for the First 750 Rows")

Add text to display the correlation coefficient¶

plt.text(0.2, 0.9, f"Correlation: {correlation_coefficient:.2f}", transform=plt.gca().transAxes, fontsize=12)

Display the plot¶

plt.legend(loc="upper right") plt.show()

Determine the correlation sign based on the correlation coefficient¶

correlation_sign = "positive" if correlation_coefficient > 0 else "negative"

Print the correlation and its sign¶

print(f"Correlation is {correlation_coefficient:.2f}, which is {correlation_sign}")

This code calculates the correlation coefficient and prints it along with the sign of the correlation, and the correlation coefficient is also displayed on the scatter plot.

i want to see the correlation, not the correlation coefficient ChatGPT I see your request. If you want to see the correlation as the visual representation, you can use a scatter plot matrix to visualize the correlation between variables. You can use Seaborn to create a scatter plot matrix. Here's how you can do that: python Copy code import seaborn as sns import pandas as pd

Assuming your DataFrame is named "allticks_data"¶

Select the specified columns and the first 750 rows¶

selected_columns = ["YATAS", "BANVT"] period_data = allticks_data[selected_columns].iloc[:750]

Create a DataFrame with the selected columns¶

df = pd.DataFrame(data=period_data, columns=selected_columns)

Create a scatter plot matrix¶

sns.pairplot(df, kind="scatter")

Show the plot¶

plt.show()

This code will create a scatter plot matrix for the selected columns, and you can observe the correlations between the variables in the visual representation. Seaborn will also provide you with scatterplots for each combination of variables in the selected columns, allowing you to see the relationships and correlations between them.

in the code you wrote for correlation coefficient, i also want to calculate the correlation itself and print it too ChatGPT Certainly! If you want to calculate and print the correlation itself, along with the correlation coefficient, you can do the following: python Copy code import matplotlib.pyplot as plt import numpy as np

Assuming your DataFrame is named "allticks_data"¶

Select the first 750 rows and the specified columns¶

selected_columns = ["YATAS", "BANVT"] period_data = allticks_data[selected_columns].iloc[:750]

Extract the data for each column¶

x = period_data["YATAS"] y = period_data["BANVT"]

Calculate the correlation coefficient and correlation¶

correlation_coefficient = np.corrcoef(x, y)[0, 1] correlation = np.corrcoef(x, y)[0, 1] # Calculate the correlation itself

Create a scatter plot¶

plt.figure(figsize=(8, 6)) plt.scatter(x, y, label="Data Points", color="blue")

Set labels and title¶

plt.xlabel("YATAS") plt.ylabel("BANVT") plt.title("Scatter Plot for the First 750 Rows")

Add text to display the correlation coefficient and correlation¶

plt.text(0.2, 0.85, f"Correlation Coefficient: {correlation_coefficient:.2f}", transform=plt.gca().transAxes, fontsize=12) plt.text(0.2, 0.78, f"Correlation: {correlation:.2f}", transform=plt.gca().transAxes, fontsize=12)

Display the plot¶

plt.legend(loc="upper right") plt.show()

Determine the correlation sign based on the correlation coefficient¶

correlation_sign = "positive" if correlation_coefficient > 0 else "negative"

Print the correlation, its coefficient, and its sign¶

print(f"Correlation is {correlation:.2f}, which is {correlation_sign}")

This code calculates both the correlation coefficient and the correlation itself, and prints them along with the sign of the correlation. It also displays the correlation coefficient and correlation on the scatter plot.

when i run my code it says that correlation: nan. why is that the case? ChatGPT If you're encountering "correlation: nan" in your output, it suggests that the calculated correlation coefficient is resulting in "NaN" (Not-a-Number) value. This typically occurs when one or both of the variables being used for correlation contain missing or undefined values (i.e., NaN) in the dataset. To address this issue, you can check and handle missing or undefined values in your data before calculating the correlation. Here are a few steps you can take: Check for missing values: Use pd.isna() or pd.isnull() to check for missing values in your dataset, specifically in the "YATAS" and "BANVT" columns. python Copy code missing_values = period_data.isna().any() print("Missing Values:", missing_values)

Handle missing values: If missing values are present, you can choose to either remove the rows with missing values or impute the missing values with an appropriate strategy, such as mean, median, or interpolation. For example, you can remove rows with missing values as follows: python Copy code period_data = period_data.dropna()

Or you can impute missing values with the mean: python Copy code period_data.fillna(period_data.mean(), inplace=True)

After handling missing values, try calculating the correlation again. This should resolve the issue of getting "NaN" as the correlation result. If you're still encountering the issue after checking and handling missing values, there might be other factors at play, such as unusual data characteristics or a mistake in data preprocessing. In such cases, a more detailed examination of the data and the code may be necessary to identify the specific cause of the issue.

can i write this code for it to calculate without missing values? ChatGPT Certainly! You can modify the code to calculate the correlation without missing values by removing any rows that contain missing values in the dataset. Here's how you can do that: python Copy code import matplotlib.pyplot as plt import numpy as np

Assuming your DataFrame is named "allticks_data"¶

Select the first 750 rows and the specified columns¶

selected_columns = ["YATAS", "BANVT"] period_data = allticks_data[selected_columns].iloc[:750]

Drop rows with missing values¶

period_data.dropna(inplace=True)

Extract the data for each column¶

x = period_data["YATAS"] y = period_data["BANVT"]

Calculate the correlation coefficient and correlation¶

correlation_coefficient = np.corrcoef(x, y)[0, 1] correlation = np.corrcoef(x, y)[0, 1] # Calculate the correlation itself

Create a scatter plot¶

plt.figure(figsize=(8, 6)) plt.scatter(x, y, label="Data Points", color="blue")

Set labels and title¶

plt.xlabel("YATAS") plt.ylabel("BANVT") plt.title("Scatter Plot for the First 750 Rows")

Add text to display the correlation coefficient and correlation¶

plt.text(0.2, 0.85, f"Correlation Coefficient: {correlation_coefficient:.2f}", transform=plt.gca().transAxes, fontsize=12) plt.text(0.2, 0.78, f"Correlation: {correlation:.2f}", transform=plt.gca().transAxes, fontsize=12)

Display the plot¶

plt.legend(loc="upper right") plt.show()

Determine the correlation sign based on the correlation coefficient¶

correlation_sign = "positive" if correlation_coefficient > 0 else "negative"

Print the correlation, its coefficient, and its sign¶

print(f"Correlation is {correlation:.2f}, which is {correlation_sign}")

In this code, the dropna() method is used to remove rows with missing values, and then the correlation calculation is performed on the cleaned dataset. This ensures that the calculation is done without missing values, and you won't encounter "NaN" as the correlation result.

can you repeat this code for each 750 rows. i want to have separeter plots for each. and i want to have a list of all correlation values at the end. in the list, i also want to see the corresponding timestamp to those rows, i.e. the values in the "timestamp" column that are at the start and end rows ChatGPT To calculate and visualize correlations for each 750-row period, create separate plots for each period, and generate a list of correlation values along with the corresponding timestamps, you can use the following code: python Copy code import matplotlib.pyplot as plt import numpy as np

Assuming your DataFrame is named "allticks_data"¶

Select the specified columns¶

selected_columns = ["YATAS", "BANVT"]

Initialize a list to store correlation values and corresponding timestamps¶

correlations_with_timestamps = []

Calculate the correlation and create and display separate plots for each set of 750 rows¶

num_sets = len(allticks_data) // 750

for i in range(num_sets): start_row = i 750 end_row = (i + 1) 750 period_data = allticks_data[selected_columns][start_row:end_row]

# Drop rows with missing values
period_data.dropna(inplace=True)

# Extract the data for each column
x = period_data["YATAS"]
y = period_data["BANVT"]

# Calculate the correlation coefficient and correlation
correlation_coefficient = np.corrcoef(x, y)[0, 1]
correlation = np.corrcoef(x, y)[0, 1]  # Calculate the correlation itself

# Get the start and end timestamps
start_timestamp = allticks_data["timestamps"].iloc[start_row]
end_timestamp = allticks_data["timestamps"].iloc[end_row - 1]

# Create a scatter plot for the period
plt.figure(figsize=(8, 6))
plt.scatter(x, y, label="Data Points", color="blue")

# Set labels and title
plt.xlabel("YATAS")
plt.ylabel("BANVT")
plt.title(f"Scatter Plot for Rows {start_row+1}-{end_row}")

# Add text to display the correlation coefficient and correlation
plt.text(0.2, 0.85, f"Correlation Coefficient: {correlation_coefficient:.2f}", transform=plt.gca().transAxes, fontsize=12)
plt.text(0.2, 0.78, f"Correlation: {correlation:.2f}", transform=plt.gca().transAxes, fontsize=12)

# Display the plot
plt.legend(loc="upper right")
plt.show()

# Determine the correlation sign based on the correlation coefficient
correlation_sign = "positive" if correlation_coefficient > 0 else "negative"

# Add the correlation and timestamps to the list
correlations_with_timestamps.append({
    "Correlation": correlation,
    "Correlation Sign": correlation_sign,
    "Start Timestamp": start_timestamp,
    "End Timestamp": end_timestamp
})

Display the list of correlation values and timestamps¶

for item in correlations_with_timestamps: print(f"Correlation: {item['Correlation']:.2f}, Sign: {item['Correlation Sign']}, Start Timestamp: {item['Start Timestamp']}, End Timestamp: {item['End Timestamp']}")

This code calculates correlations, creates scatter plots, and stores the correlation values along with the corresponding timestamps in the list correlations_with_timestamps. Each element in the list contains the correlation value, sign, and the start and end timestamps for each 750-row period.

can i add up all these correlation values ChatGPT If you want to calculate the sum of all the correlation values from the list correlations_with_timestamps, you can do so by iterating through the list and summing the correlation values. Here's how you can calculate the sum of all correlation values: python Copy code

Calculate the sum of all correlation values¶

total_correlation_sum = sum(item["Correlation"] for item in correlations_with_timestamps)

Print the sum¶

print(f"Sum of all correlation values: {total_correlation_sum:.2f}")

You can add this code at the end of the previous code I provided. It calculates the sum of all correlation values and prints the result. This will give you the cumulative correlation value for all the 750-row periods.

i want to plot these two columns between these row periods seperately " 5251-6000 10501-11250 12751-13500 17251-18000 23251-24000 24000-24750 28750-29250 31501-32250 36001-36750 38251-39000 39001-39750 42751-43500 44251-45000 47251-48000 48001-48750" ChatGPT To create separate scatter plots for the "YATAS" and "BANVT" columns for the specified row periods and calculate correlations for each of these periods, you can use the following code: python Copy code import matplotlib.pyplot as plt import numpy as np

Assuming your DataFrame is named "allticks_data"¶

Define the row periods¶

row_periods = [ (5251, 6000), (10501, 11250), (12751, 13500), (17251, 18000), (23251, 24000), (24001, 24750), (28751, 29250), (31501, 32250), (36001, 36750), (38251, 39000), (39001, 39750), (42751, 43500), (44251, 45000), (47251, 48000), (48001, 48750) ]

Initialize a list to store correlation values¶

correlation_values = []

Iterate through the row periods and create plots¶

for start_row, end_row in row_periods: period_data = allticks_data[["YATAS", "BANVT"]].iloc[start_row-1:end_row] # Adjust for 0-based indexing period_data.dropna(inplace=True)

# Extract the data for each column
x = period_data["YATAS"]
y = period_data["BANVT"]

# Calculate the correlation coefficient and correlation
correlation_coefficient = np.corrcoef(x, y)[0, 1]
correlation = np.corrcoef(x, y)[0, 1]

# Create a scatter plot for the period
plt.figure(figsize=(8, 6))
plt.scatter(x, y, label="Data Points", color="blue")

# Set labels and title
plt.xlabel("YATAS")
plt.ylabel("BANVT")
plt.title(f"Scatter Plot for Rows {start_row}-{end_row}")

# Add text to display the correlation coefficient and correlation
plt.text(0.2, 0.85, f"Correlation Coefficient: {correlation_coefficient:.2f}", transform=plt.gca().transAxes, fontsize=12)
plt.text(0.2, 0.78, f"Correlation: {correlation:.2f}", transform=plt.gca().transAxes, fontsize=12)

# Display the plot
plt.legend(loc="upper right")
plt.show()

# Determine the correlation sign based on the correlation coefficient
correlation_sign = "positive" if correlation_coefficient > 0 else "negative"

# Append the correlation value to the list
correlation_values.append({
    "Start Row": start_row,
    "End Row": end_row,
    "Correlation": correlation,
    "Correlation Sign": correlation_sign
})

Print the list of correlation values¶

for item in correlation_values: print(f"Rows {item['Start Row']}-{item['End Row']}: Correlation {item['Correlation']:.2f}, Sign: {item['Correlation Sign']}")

This code creates scatter plots and calculates correlations for each of the specified row periods, and it also prints the list of correlation values with the corresponding row period details.

i only want to plot the values in those rows ChatGPT If you want to create scatter plots for only the specified row periods and not include rows outside those periods, you can modify the code as follows: python Copy code import matplotlib.pyplot as plt import numpy as np

Assuming your DataFrame is named "allticks_data"¶

Define the row periods¶

row_periods = [ (5251, 6000), (10501, 11250), (12751, 13500), (17251, 18000), (23251, 24000), (24001, 24750), (28751, 29250), (31501, 32250), (36001, 36750), (38251, 39000), (39001, 39750), (42751, 43500), (44251, 45000), (47251, 48000), (48001, 48750) ]

Initialize a list to store correlation values¶

correlation_values = []

Iterate through the row periods and create plots¶

for start_row, end_row in row_periods: period_data = allticks_data[["YATAS", "BANVT"]].iloc[start_row-1:end_row] # Adjust for 0-based indexing period_data.dropna(inplace=True)

# Extract the data for each column
x = period_data["YATAS"]
y = period_data["BANVT"]

# Calculate the correlation coefficient and correlation
correlation_coefficient = np.corrcoef(x, y)[0, 1]
correlation = np.corrcoef(x, y)[0, 1]

# Create a scatter plot for the period
plt.figure(figsize=(8, 6))
plt.scatter(x, y, label="Data Points", color="blue")

# Set labels and title
plt.xlabel("YATAS")
plt.ylabel("BANVT")
plt.title(f"Scatter Plot for Rows {start_row}-{end_row}")

# Add text to display the correlation



i only want to see the values of those columns at those rows. i dont want to see the correlations ChatGPT If you want to create scatter plots for only the specified row periods without displaying correlation values, you can modify the code to focus on plotting the data only, without calculating and displaying the correlation. Here's how to do that: python Copy code import matplotlib.pyplot as plt

Assuming your DataFrame is named "allticks_data"¶

Define the row periods¶

row_periods = [ (5251, 6000), (10501, 11250), (12751, 13500), (17251, 18000), (23251, 24000), (24001, 24750), (28751, 29250), (31501, 32250), (36001, 36750), (38251, 39000), (39001, 39750), (42751, 43500), (44251, 45000), (47251, 48000), (48001, 48750) ]

Iterate through the row periods and create scatter plots¶

for start_row, end_row in row_periods: period_data = allticks_data[["YATAS", "BANVT"]].iloc[start_row-1:end_row] # Adjust for 0-based indexing

# Extract the data for each column
x = period_data["YATAS"]
y = period_data["BANVT"]

# Create a scatter plot for the period
plt.figure(figsize=(8, 6))
plt.scatter(x, y, label="Data Points", color="blue")

# Set labels and title
plt.xlabel("YATAS")
plt.ylabel("BANVT")
plt.title(f"Scatter Plot for Rows {start_row}-{end_row}")

# Display the plot
plt.legend(loc="upper right")
plt.show()


This code will create separate scatter plots for the specified row periods, focusing on displaying the data without calculating or displaying correlation values.

i want to see these rows of "YATAS" and "BANVT" in that list plotted in a graph. ChatGPT To display the rows of "YATAS" and "BANVT" in the specified row periods as separate graphs, you can create subplots for each row period and plot the data. Here's how you can do it: python Copy code import matplotlib.pyplot as plt

Assuming your DataFrame is named "allticks_data"¶

Define the row periods¶

row_periods = [ (5251, 6000), (10501, 11250), (12751, 13500), (17251, 18000), (23251, 24000), (24001, 24750), (28751, 29250), (31501, 32250), (36001, 36750), (38251, 39000), (39001, 39750), (42751, 43500), (44251, 45000), (47251, 48000), (48001, 48750) ]

Iterate through the row periods and create separate graphs¶

for i, (start_row, end_row) in enumerate(row_periods, start=1): period_data = allticks_data[["YATAS", "BANVT"]].iloc[start_row-1:end_row] # Adjust for 0-based indexing

# Extract the data for each column
x = period_data["YATAS"]
y = period_data["BANVT"]

# Create a new subplot for each row period
plt.subplot(5, 3, i)  # 5 rows, 3 columns of subplots

# Plot the data
plt.plot(x, label="YATAS", color="blue")
plt.plot(y, label="BANVT", color="red")

# Set labels and title
plt.xlabel("Row")
plt.ylabel("Value")
plt.title(f"Rows {start_row}-{end_row}")

# Add a legend
plt.legend()

Adjust subplot layout¶

plt.tight_layout()

Show all subplots¶

plt.show()

This code creates separate graphs (subplots) for each row period, displaying the "YATAS" and "BANVT" values in those rows. You can adjust the number of rows and columns of subplots to fit your preferred layout. In this example, there are 5 rows and 3 columns of subplots.

i dont want to see them as subplots. i want to see them as seperate plots ChatGPT If you want to see each row period as a separate individual plot, not as subplots, you can modify the code to create separate plots for each row period. Here's how to do that: python Copy code import matplotlib.pyplot as plt

Assuming your DataFrame is named "allticks_data"¶

Define the row periods¶

row_periods = [ (5251, 6000), (10501, 11250), (12751, 13500), (17251, 18000), (23251, 24000), (24001, 24750), (28751, 29250), (31501, 32250), (36001, 36750), (38251, 39000), (39001, 39750), (42751, 43500), (44251, 45000), (47251, 48000), (48001, 48750) ]

Iterate through the row periods and create separate plots¶

for start_row, end_row in row_periods: period_data = allticks_data[["YATAS", "BANVT"]].iloc[start_row-1:end_row] # Adjust for 0-based indexing

# Extract the data for each column
x = period_data["YATAS"]
y = period_data["BANVT"]

# Create a new figure for each row period
plt.figure(figsize=(8, 6))

# Plot the data
plt.plot(x, label="YATAS", color="blue")
plt.plot(y, label="BANVT", color="red")

# Set labels and title
plt.xlabel("Row")
plt.ylabel("Value")
plt.title(f"Rows {start_row}-{end_row}")

# Add a legend
plt.legend()

# Show the individual plot
plt.show()


This code creates separate individual plots for each row period, displaying the "YATAS" and "BANVT" values in those rows without using subplots.

as i said, in this data set; there are 50011 rows and 61 columns. The first column consist of timestamps, other 60 columns each belongs to a different company's stock prices at that time stamp. How can i apply Principal Component Analysis to this data set ChatGPT Principal Component Analysis (PCA) is a dimensionality reduction technique commonly used for data with multiple features or dimensions. In your case, you have a dataset with 50011 rows and 61 columns, where the first column contains timestamps and the remaining 60 columns represent different company's stock prices at those timestamps. You can apply PCA to this dataset as follows: Data Preprocessing: Remove the timestamp column, as PCA operates on numerical data, not timestamps. Handle any missing data by imputing missing values or removing rows with missing data, as PCA doesn't work well with missing values. Standardization (Optional but Recommended): Standardize the data by subtracting the mean and dividing by the standard deviation for each of the 60 columns containing stock prices. Standardization is recommended for PCA to give equal weight to all features. Apply PCA: Use a library like scikit-learn in Python to apply PCA. Here's an example of how to do it: python Copy code from sklearn.decomposition import PCA

Assuming your DataFrame is named "allticks_data" with the stock prices¶

Drop the timestamp column¶

stock_data = allticks_data.drop(columns=['timestamp'])

Optionally, standardize the data¶

from sklearn.preprocessing import StandardScaler scaler = StandardScaler() stock_data_standardized = scaler.fit_transform(stock_data)

Initialize and fit the PCA model¶

n_components = 60 # Number of principal components to retain pca = PCA(n_components=n_components) pca.fit(stock_data_standardized) # Use the standardized data if you performed standardization

Get the principal components¶

principalcomponents = pca.components

Get the explained variance ratio¶

explained_variance_ratio = pca.explained_varianceratio

The n_components parameter in PCA allows you to specify how many principal components you want to retain. You can choose the number based on the variance you want to explain. explained_variance_ratio will tell you the proportion of variance explained by each principal component. Visualization (Optional): You can visualize the explained variance to decide how many principal components to retain. A scree plot or a cumulative explained variance plot can be helpful for this purpose. Transform Data: You can transform your data using the selected number of principal components if you want to use the reduced-dimensional representation for further analysis. Interpret Results: You can analyze the principal components to understand which features (stocks) contribute most to each component. Remember that the number of principal components to retain is a crucial decision and should be based on the amount of variance you want to capture while reducing the dimensionality of your data. You might want to experiment with different values of n_components and see how much variance is explained to make an informed decision.

can i fill in the missing rows with the average of the existing nearest upper and below values? can you add this to the code before applying the pca ChatGPT Certainly, you can fill in missing values in a DataFrame by taking the average of the nearest upper and lower values. You can use the fillna() method in pandas to accomplish this. Here's how you can update your code to fill in missing values with the average of the nearest neighbors before applying PCA: python Copy code from sklearn.decomposition import PCA import pandas as pd

Assuming your DataFrame is named "allticks_data" with the stock prices¶

Drop the timestamp column¶

stock_data = allticks_data.drop(columns=['timestamp'])

Fill in missing values with the average of the nearest neighbors¶

stock_data_filled = stock_data.interpolate(method='linear', axis=0)

Optionally, standardize the data¶

from sklearn.preprocessing import StandardScaler scaler = StandardScaler() stock_data_standardized = scaler.fit_transform(stock_data_filled)

Initialize and fit the PCA model¶

n_components = 60 # Number of principal components to retain pca = PCA(n_components=n_components) pca.fit(stock_data_standardized) # Use the standardized data if you performed standardization

Get the principal components¶

principalcomponents = pca.components

Get the explained variance ratio¶

explained_variance_ratio = pca.explained_varianceratio

In this code, we use the interpolate method with the linear interpolation method to fill in missing values in each column. This method linearly interpolates the missing values based on the values of their nearest upper and lower neighbors in the column. After filling in the missing values, you can proceed with PCA as before.

it still gives the error "ValueError: Input X contains NaN. PCA does not accept missing values encoded as NaN natively." ChatGPT I apologize for the confusion. You're right; PCA in scikit-learn does not handle missing values natively, and interpolation using the interpolate method may not be sufficient. In this case, you need to handle the missing values more explicitly. One common approach is to impute missing values with the mean of the respective column. Here's how you can do it: python Copy code from sklearn.decomposition import PCA import pandas as pd from sklearn.impute import SimpleImputer from sklearn.preprocessing import StandardScaler

Assuming your DataFrame is named "allticks_data" with the stock prices¶

Drop the timestamp column¶

stock_data = allticks_data.drop(columns=['timestamp'])

Create an imputer to fill missing values with the column mean¶

imputer = SimpleImputer(strategy='mean') stock_data_imputed = imputer.fit_transform(stock_data)

Standardize the data¶

scaler = StandardScaler() stock_data_standardized = scaler.fit_transform(stock_data_imputed)

Initialize and fit the PCA model¶

n_components = 60 # Number of principal components to retain pca = PCA(n_components=n_components) pca.fit(stock_data_standardized)

Get the principal components¶

principalcomponents = pca.components

Get the explained variance ratio¶

explained_variance_ratio = pca.explained_varianceratio

In this code: We use the SimpleImputer from scikit-learn to impute missing values with the column mean. This is a common method to handle missing data before applying PCA. After imputing missing values, we standardize the data with StandardScaler. Then, we proceed to perform PCA as previously described. This approach ensures that missing values are handled before PCA, and it should resolve the "ValueError: Input X contains NaN" issue.

how can i interpret the results of explained_variance_ratio array([4.88093861e-01, 1.70283496e-01, 1.07268396e-01, 4.98911574e-02, 3.80809652e-02, 2.14055156e-02, 1.53236261e-02, 1.32797651e-02, 1.02353851e-02, 8.87784475e-03, 8.34348500e-03, 7.10699121e-03, 6.22456829e-03, 5.94513977e-03, 4.79582514e-03, 4.54884774e-03, 3.47738719e-03, 3.08820485e-03, 2.63245555e-03, 2.61131197e-03, 2.22285259e-03, 2.06089345e-03, 1.81811064e-03, 1.71262120e-03, 1.62892002e-03, 1.36144582e-03, 1.28810777e-03, 1.18659269e-03, 1.12189254e-03, 1.06948368e-03, 9.66592430e-04, 9.34899694e-04, 8.18675874e-04, 7.97092634e-04, 7.39834592e-04, 6.91087590e-04, 6.55839655e-04, 6.18756397e-04, 6.07880344e-04, 5.53020787e-04, 4.75100471e-04, 4.63767517e-04, 4.27597596e-04, 4.22311765e-04, 3.99869410e-04, 3.67213385e-04, 3.53299276e-04, 3.32230281e-04, 3.12833764e-04, 2.81917451e-04, 2.53341253e-04, 2.24278065e-04, 2.18195177e-04, 2.09072241e-04, 1.82989643e-04, 1.64178884e-04, 1.52198743e-04, 1.45553903e-04, 1.42375238e-04, 1.02846619e-04]) ChatGPT The explained_variance_ratio array returned by PCA represents the proportion of the total variance in the original dataset that is explained by each principal component. Each value in this array indicates the ratio of variance explained by a particular principal component. In your specific example, you have an array with 60 values, and it represents the variance explained by each of the 60 principal components. To interpret these values, you can consider the following: Variance Explained by Each Component: The first value in the array (0.488) indicates that the first principal component explains approximately 48.8% of the total variance in your data. The second value (0.170) indicates that the second principal component explains approximately 17.0% of the variance, and so on. Cumulative Variance Explained: You can also calculate the cumulative explained variance by summing the values in the array. For example, if you sum the first two values, you get 0.488 + 0.170 = 0.658, which means that the first two principal components explain approximately 65.8% of the total variance in your data. Dimensionality Reduction: These values help you decide how many principal components to retain for dimensionality reduction. If you want to retain a certain percentage of the total variance (e.g., 95%), you can sum the values in the explained_variance_ratio array until you reach or exceed that threshold. The corresponding number of principal components will give you the reduced dimensionality. Interpretation of Principal Components: The values in the array also give you an idea of which principal components capture the most variance. Components with higher values explain more variance and are more important in representing the data. You can also examine the loadings of the original features on these principal components to interpret the relationship between the original features and the principal components. In summary, the explained_variance_ratio array provides valuable information about how well each principal component represents the variance in your data and helps you make decisions about dimensionality reduction and feature importance.

does n_components have to be equal to the column numbers ChatGPT No, the n_components parameter in PCA does not have to be equal to the number of columns (features) in your dataset. In fact, one of the main purposes of PCA is dimensionality reduction, which often involves reducing the number of components (dimensions) to a smaller value than the original number of features. You can choose the value of n_components based on your specific goals, such as: Preserving Variance: You can select n_components to preserve a certain percentage of the total variance in your data. For example, you might choose to retain enough components to capture 95% of the variance. Dimensionality Reduction: If your dataset has a large number of features, you can reduce dimensionality by selecting a smaller value for n_components. This can help mitigate the curse of dimensionality and improve the efficiency of subsequent analysis. Feature Selection: PCA can also be used for feature selection. You might choose a value of n_components that corresponds to the most important components and, therefore, the most important original features. Visualization: In some cases, you might select n_components to facilitate visualization. For example, you might choose two or three components to create 2D or 3D plots. The choice of n_components should be based on the specific objectives of your analysis. It's not required to be equal to the number of columns, and in many cases, it's desirable to select a smaller value to achieve one of the objectives mentioned above.

what does pca do with this data set? i dont understand the benefit of doing this? can you explain ChatGPT Principal Component Analysis (PCA) is a dimensionality reduction technique that can be applied to a dataset like the one you described, which has multiple features (in your case, 60 columns representing stock prices). PCA serves several purposes and provides several benefits: Dimensionality Reduction: One of the main goals of PCA is to reduce the dimensionality of the data. In your dataset, you have 60 columns (features), and it might be challenging to work with such high-dimensional data. PCA allows you to transform the data into a new set of variables (principal components) that capture the most important information while reducing the number of dimensions. This can make subsequent analysis more manageable and computationally efficient. Decorrelation: PCA creates new variables (principal components) that are linear combinations of the original features. These new variables are designed to be uncorrelated with each other. This can help simplify the relationships between variables and reduce multicollinearity, which can be beneficial in various statistical and machine learning tasks. Variance Capture: PCA arranges the principal components in descending order of the variance they explain. The first principal component explains the most variance, and subsequent components explain less. By selecting a subset of these components, you can retain the most important information in your data while reducing noise. Data Compression: PCA can be used for data compression. The reduced-dimensional representation of your data (using fewer principal components) can save storage space and speed up data processing. Visualization: PCA can help with data visualization. By projecting data points onto a lower-dimensional space (e.g., 2D or 3D), you can create scatter plots or other visualizations that allow you to explore and understand your data more easily. Feature Selection: PCA can assist in feature selection. You can identify which original features contribute most to the principal components with the highest variance. This information can guide feature selection in machine learning or statistical modeling. Noise Reduction: By focusing on the principal components with the highest variance and ignoring those with low variance, you effectively filter out noise and outliers in your data. Improved Model Performance: In some cases, reducing the dimensionality of your data through PCA can lead to improved model performance in machine learning tasks, as models may become less prone to overfitting. Overall, PCA is a valuable technique when you have high-dimensional data, and it can serve various purposes, depending on your specific goals. It can help you gain insights into your data, simplify complex datasets, and prepare the data for downstream analysis or modeling.

can i see the components in this pca ChatGPT Yes, you can certainly examine the principal components obtained from PCA. The principal components are linear combinations of the original features that capture the most significant variance in the data. You can view these components to gain insights into the relationships between the original features and how they contribute to each component. Here's how you can do it: python Copy code

Assuming you have already performed PCA and stored it in the 'pca' object¶

'principal_components' contains the principal components¶

Access the principal components¶

principalcomponents = pca.components

Create a DataFrame to visualize the components¶

components_df = pd.DataFrame(principal_components, columns=stock_data.columns)

Display the components¶

print(components_df)

This code will create a DataFrame called components_df where each row corresponds to a principal component, and each column corresponds to an original feature. Each cell in the DataFrame represents the weight of the corresponding feature in the respective principal component. These weights indicate how strongly each feature contributes to the component. By examining the components, you can gain insights into which original features are most influential in each principal component and how the components relate to the underlying structure of your data. The first few principal components often capture the most significant patterns, so you may focus on them for interpretation.

can i have a table related to importance of components (standard deviation, proportion of variance, cumulative proportion) and another table for loadings (columns and components against them) ChatGPT Certainly, you can create tables to display the importance of components, including standard deviation, proportion of variance, and cumulative proportion. Additionally, you can create a table to show the loadings, which represent the relationship between the original columns and the principal components. Here's how you can do it: python Copy code

Assuming you have already performed PCA and stored it in the 'pca' object¶

'explained_variance_ratio' contains the proportion of variance explained by each component¶

Create a table for component statistics¶

component_stats = pd.DataFrame({ 'Component': range(1, len(explained_variance_ratio) + 1), 'Standard Deviation': np.sqrt(pca.explainedvariance), 'Proportion of Variance': pca.explained_varianceratio, 'Cumulative Proportion': np.cumsum(pca.explained_varianceratio) })

Display the component statistics table¶

print("Component Statistics:") print(component_stats)

Create a table for loadings¶

loadings = pd.DataFrame(pca.components_, columns=stock_data.columns)

Display the loadings table¶

print("\nLoadings Table:") print(loadings)

In this code: component_stats is a DataFrame that provides information about each component, including its standard deviation, proportion of variance explained, and cumulative proportion of variance explained. loadings is a DataFrame that shows the loadings, which represent the relationship between the original columns and the principal components. Each row in the loadings table corresponds to a principal component, and each column represents an original feature. The values in the table indicate how strongly each feature contributes to each component. By examining these tables, you can better understand the importance of each component, the proportion of variance explained by each component, and the relationships between the original features and the principal components. This information can be valuable for interpretation and decision-making in your analysis.

can you modify the code you wrote for PCA and add this part ChatGPT Certainly! Here's the modified code that includes the creation of tables for component statistics and loadings: python Copy code from sklearn.decomposition import PCA import pandas as pd import numpy as np

Assuming your DataFrame is named "allticks_data" with the stock prices¶

Drop the timestamp column¶

stock_data = allticks_data.drop(columns=['timestamp'])

Create an imputer to fill missing values with the column mean¶

imputer = SimpleImputer(strategy='mean') stock_data_imputed = imputer.fit_transform(stock_data)

Standardize the data¶

scaler = StandardScaler() stock_data_standardized = scaler.fit_transform(stock_data_imputed)

Initialize and fit the PCA model¶

n_components = 60 # Number of principal components to retain pca = PCA(n_components=n_components) pca.fit(stock_data_standardized)

Get the explained variance ratio¶

explained_variance_ratio = pca.explained_varianceratio

Create a table for component statistics¶

component_stats = pd.DataFrame({ 'Component': range(1, len(explained_variance_ratio) + 1), 'Standard Deviation': np.sqrt(pca.explainedvariance), 'Proportion of Variance': pca.explained_varianceratio, 'Cumulative Proportion': np.cumsum(pca.explained_varianceratio) })

Display the component statistics table¶

print("Component Statistics:") print(component_stats)

Create a table for loadings¶

loadings = pd.DataFrame(pca.components_, columns=stock_data.columns)

Display the loadings table¶

print("\nLoadings Table:") print(loadings)

With this code, you'll perform PCA, calculate the explained variance ratio, and then create tables for component statistics and loadings, as requested. The component statistics table provides insights into the importance of each component, while the loadings table shows the relationship between the original features and the principal components.

how can i interpret the component statistics table? ChatGPT Interpreting the component statistics table generated from PCA is important for understanding the significance of each principal component and deciding how many components to retain. Here's a breakdown of the key statistics in the table: Component Number: This is a simple index that identifies each principal component. The first component is the most important, the second is the second most important, and so on. Standard Deviation: The standard deviation of each principal component measures the spread or variability of data points along that component's direction. Larger standard deviations indicate more spread, suggesting that the component captures more variance in the data. You can use the standard deviation to gauge the scale of each component's importance. Proportion of Variance: This value represents the proportion of the total variance in the original dataset explained by each principal component. It is a critical statistic in PCA. Larger values indicate that the component captures a larger portion of the dataset's variance. For instance, if the first component's proportion of variance is 0.488, it means that the first component explains approximately 48.8% of the total variance in your data. Cumulative Proportion: This is the cumulative sum of the proportions of variance. It shows the cumulative explained variance as you move through the components. You can use this to determine how many components are needed to capture a certain percentage of the total variance. For example, if the cumulative proportion exceeds 0.95, it means that the selected components together explain over 95% of the total variance. Interpretation of the component statistics table often involves looking at the proportion of variance and cumulative proportion. Here's how to interpret these statistics: Proportion of Variance: Larger values indicate more important components. If you are interested in dimensionality reduction or capturing the most significant patterns in your data, you should focus on components with higher proportions of variance. Cumulative Proportion: This column helps you decide how many components to retain. It indicates the fraction of the total variance you're capturing as you add components in order. If you want to retain a certain percentage of the total variance (e.g., 95% or 99%), look for the point in the cumulative proportion column where it exceeds that threshold. The number of components at that point is how many you might consider retaining. In summary, the component statistics table helps you identify the most important components and decide how many components you need to retain to capture a desired percentage of the total variance in your data. It guides the dimensionality reduction process and provides insights into the underlying structure of your dataset.

how can i interpret the loadings table? ChatGPT Interpreting the loadings table generated from PCA is essential for understanding how the original features (columns) contribute to each principal component. The loadings table represents the relationships between the original features and the principal components. Here's how to interpret the loadings: Principal Components (Rows): Each row in the loadings table corresponds to a principal component. The first row represents the loadings for the first principal component, the second row for the second component, and so on. The values in each row indicate how strongly each original feature contributes to that specific component. Original Features (Columns): Each column in the loadings table represents an original feature from your dataset. The values in each column indicate the relationship between that feature and the principal components. The larger the absolute value of a loading in a column, the more that feature contributes to the principal component. Positive and Negative Loadings: The signs (positive or negative) of the loadings are important. A positive loading indicates a positive correlation between the original feature and the principal component. In contrast, a negative loading indicates a negative correlation. Large positive or negative loadings imply that a feature strongly influences the corresponding component. Strength of Loadings: The magnitude (absolute value) of the loading indicates the strength of the relationship. Larger absolute values signify stronger associations between a feature and a component. If the loading is close to 1 (positive or negative), it suggests that the feature has a strong influence on the component. Zero Loadings: Loadings close to zero suggest that the original feature has minimal impact on the corresponding principal component. Features with near-zero loadings are not contributing significantly to that component. To interpret the loadings table: Pattern Recognition: Examine the loadings in each component to identify which original features are strongly associated with that component. High positive or negative loadings indicate that the corresponding feature plays a significant role in defining that component. Feature Importance: Features with the largest absolute loadings in a component are the most important for that component. You can identify which features have the most influence on the overall variation captured by each component. Correlation Direction: Pay attention to the signs of the loadings. Positive loadings indicate a positive correlation between the feature and the component, while negative loadings indicate a negative correlation. Dimension Interpretation: If your principal components correspond to real-world concepts or patterns (e.g., in a stock market dataset, a component might represent a particular industry or sector), you can interpret the loadings to understand which stocks are positively or negatively associated with that component. In summary, the loadings table provides insights into the relationships between original features and principal components, helping you understand how each component captures the variation in your data and which features are most influential in defining those components.

can i see, for each component, which company's have the biggest contribute? ChatGPT Yes, you can identify which original features (in your case, companies) have the largest contributions to each principal component. This information is obtained from the loadings in the loadings table. To determine which companies have the biggest contributions to each component, you can look at the absolute values of the loadings and identify the features with the highest absolute loadings for each component. Here's how you can do it: python Copy code

Assuming you have already performed PCA and obtained the 'loadings' DataFrame¶

Number of components to consider¶

n_components = 5 # Choose the number of components you want to analyze

Create a dictionary to store the top contributing companies for each component¶

top_contributors = {}

Iterate through each component¶

for component in range(n_components):

# Get the loadings for the current component
component_loadings = loadings.iloc[component, :]

# Sort the companies by the absolute loadings (contribution)
sorted_contributors = component_loadings.abs().sort_values(ascending=False)

# Store the top contributing companies for the current component
top_contributors[f'Component {component + 1}'] = sorted_contributors.index[0]  # The top contributor

Print the results¶

for component, top_contributor in top_contributors.items(): print(f"{component}: {top_contributor}")

In this code, we select a certain number of components to analyze (you can change n_components to the desired number). For each component, we extract the loadings, sort them by absolute value in descending order, and store the top contributor (i.e., the company with the largest absolute loading) for that component. The output will show you which company contributes the most to each of the selected components. This information helps you understand which companies are most influential in defining each principal component in your PCA analysis.

"Positive loadings indicate a positive correlation between the feature and the component, while negative loadings indicate a negative correlation." what does positive and negative correlation with the component indicates ChatGPT In the context of Principal Component Analysis (PCA) and its loadings, positive and negative correlations with a component indicate the direction of the relationship between the original features (variables) and that specific component. Here's what positive and negative correlations mean: Positive Correlation: When a feature has a positive loading on a component, it means that an increase in the value of that feature is associated with an increase in the value of the component. In other words, the feature and the component tend to move in the same direction. Features with positive loadings contribute positively to the component, meaning they are positively associated with the pattern represented by that component. Negative Correlation: Conversely, when a feature has a negative loading on a component, it indicates that an increase in the value of that feature is associated with a decrease in the value of the component. The feature and the component tend to move in opposite directions. Features with negative loadings contribute negatively to the component, meaning they are negatively associated with the pattern represented by that component. Understanding the direction of these correlations is crucial for interpreting the principal components. It allows you to grasp the nature of the relationship between the original features and the components. Positive and negative correlations reveal how features contribute to the patterns that the principal components capture. For example, in a stock market dataset, if the first principal component has positive loadings for several technology companies, it indicates that this component represents a common positive trend among those technology companies' stock prices. Conversely, if it has negative loadings for some financial companies, it suggests that these financial companies' stock prices move in the opposite direction of the component. In practical terms, you can use this information to understand which features (companies in your dataset) are positively or negatively associated with the patterns captured by each principal component. This can aid in pattern recognition, feature selection, and data interpretation.

can we say that those who have positive correlations with the componet, have similar stock data trends? ChatGPT Yes, you can infer that features (in your case, companies' stock prices) with positive correlations to the same principal component tend to have similar stock data trends. When multiple features have positive loadings on a particular component, it indicates that these features move in the same direction and share a common pattern or trend in the data. In the context of a stock market dataset, a positive correlation among companies for a specific principal component suggests that these companies' stock prices tend to move together in a similar direction. This might indicate a common underlying factor or trend affecting those companies' stocks. For example, if the first principal component has positive loadings for several technology companies, it implies that these technology companies' stock prices are positively correlated, meaning that when one goes up, the others tend to go up as well. This could indicate a common technology industry trend. Conversely, features (companies) with negative loadings on the same component would move in the opposite direction, suggesting that they are negatively correlated with the companies that have positive loadings. By examining the loadings and correlations between features and principal components, you can identify groups of features that share common trends or patterns in your data, which can be valuable for sector analysis, risk assessment, and investment strategies in the context of a stock market dataset.

is it possible for me to see the companies who have the most similar (i.e. positive or negative in common) correlation signs with a code? ChatGPT Yes, you can write code to identify the companies that have the most similar (positive or negative) correlation signs with each other based on the loadings from PCA. This involves finding pairs of companies that have similar signs in their loadings for a given principal component. Here's how you can approach this: python Copy code import pandas as pd

Assuming you have already performed PCA and obtained the 'loadings' DataFrame¶

'component_number' represents the specific component you want to analyze¶

Get the loadings for the selected component¶

component_loadings = loadings.iloc[component_number, :]

Identify companies with the most similar correlations¶

positive_correlations = component_loadings[component_loadings > 0].index negative_correlations = component_loadings[component_loadings < 0].index

Create lists to store similar companies¶

similar_positive = [] similar_negative = []

Iterate through the positive correlations¶

for company in positive_correlations: for other_company in positive_correlations: if company != other_company: similar_positive.append((company, other_company))

Iterate through the negative correlations¶

for company in negative_correlations: for other_company in negative_correlations: if company != other_company: similar_negative.append((company, other_company))

Display the companies with the most similar positive and negative correlations¶

print(f"Companies with the most similar positive correlations: {similar_positive}") print(f"Companies with the most similar negative correlations: {similar_negative}")

In this code: component_number represents the specific component you want to analyze. You can change this to the component of interest. We identify companies with positive and negative loadings on the selected component. positive_correlations and negative_correlations are lists of company names for each type of correlation. We iterate through these lists to find pairs of companies that have the same correlation sign (either both positive or both negative). These pairs are stored in similar_positive and similar_negative lists. Finally, we print the companies with the most similar positive and negative correlations on the selected component. This code allows you to identify pairs of companies with the most similar correlation signs with respect to the chosen principal component. It can provide insights into which companies tend to move together or in opposite directions based on that component.

i want to see this for all the components ChatGPT To see the companies that have the most similar (positive or negative) correlation signs for all components, you can modify the code to iterate through each component and collect the pairs of companies for each component. Here's how you can do it: python Copy code import pandas as pd

Assuming you have already performed PCA and obtained the 'loadings' DataFrame¶

'n_components' is the total number of components¶

Create dictionaries to store similar companies for positive and negative correlations¶

similar_positive = {} similar_negative = {}

for component_number in range(n_components):

# Get the loadings for the current component
component_loadings = loadings.iloc[component_number, :]

# Identify companies with positive and negative correlations
positive_correlations = component_loadings[component_loadings > 0].index
negative_correlations = component_loadings[component_loadings < 0].index

# Create lists to store similar companies for the current component
similar_positive[component_number] = []
similar_negative[component_number] = []

# Iterate through the positive correlations
for company in positive_correlations:
    for other_company in positive_correlations:
        if company != other_company:
            similar_positive[component_number].append((company, other_company))

# Iterate through the negative correlations
for company in negative_correlations:
    for other_company in negative_correlations:
        if company != other_company:
            similar_negative[component_number].append((company, other_company))

Display the companies with the most similar positive and negative correlations for each component¶

for component_number in range(n_components): print(f"Component {component_number + 1}:") print(f"Companies with the most similar positive correlations: {similar_positive[component_number]}") print(f"Companies with the most similar negative correlations: {similar_negative[component_number]}\n")

In this code, we iterate through each component and identify pairs of companies with similar correlation signs for positive and negative correlations. The results are stored in dictionaries where the keys represent the component number, and the values are lists of company pairs with similar correlations for that component. The code then prints these results for all components. This allows you to explore which companies tend to move together or in opposite directions for each principal component in your PCA analysis.

i see that this code gives me pairs of companies, but can i see the list of those companies instead ChatGPT If you want to see the lists of companies with similar (positive or negative) correlation signs for each component instead of pairs of companies, you can modify the code to collect the lists of companies and display them. Here's how you can do that: python Copy code import pandas as pd

Assuming you have already performed PCA and obtained the 'loadings' DataFrame¶

'n_components' is the total number of components¶

Create dictionaries to store lists of companies for positive and negative correlations¶

similar_positive = {} similar_negative = {}

for component_number in range(n_components):

# Get the loadings for the current component
component_loadings = loadings.iloc[component_number, :]

# Identify companies with positive and negative correlations
positive_correlations = component_loadings[component_loadings > 0].index
negative_correlations = component_loadings[component_loadings < 0].index

# Store the lists of companies for the current component
similar_positive[component_number] = list(positive_correlations)
similar_negative[component_number] = list(negative_correlations)

Display the lists of companies with similar positive and negative correlations for each component¶

for component_number in range(n_components): print(f"Component {component_number + 1}:") print(f"Companies with positive correlations: {similar_positive[component_number]}") print(f"Companies with negative correlations: {similar_negative[component_number]}\n")

This modified code collects the lists of companies with similar correlations for positive and negative correlations on each component. It then prints these lists for all components, providing you with a clearer view of which companies share similar correlations for each principal component.

can i see the companies that behave in the most similar way? the companies that have the most number of the same correlation sign for a component in total? ChatGPT If you want to find the companies that behave in the most similar way, meaning they share the most number of the same correlation signs (either positive or negative) for a component across all components, you can modify the code to keep track of such companies. Here's how you can do it: python Copy code import pandas as pd

Assuming you have already performed PCA and obtained the 'loadings' DataFrame¶

'n_components' is the total number of components¶

Create dictionaries to store the companies with similar correlation signs (positive or negative) for each component¶

similar_companies_positive = {} similar_companies_negative = {}

for component_number in range(n_components):

# Get the loadings for the current component
component_loadings = loadings.iloc[component_number, :]

# Identify companies with positive and negative correlations
positive_correlations = component_loadings[component_loadings > 0].index
negative_correlations = component_loadings[component_loadings < 0].index

# Update the dictionaries to include the companies for the current component
for company in positive_correlations:
    if company not in similar_companies_positive:
        similar_companies_positive[company] = set()
    similar_companies_positive[company].add(component_number)

for company in negative_correlations:
    if company not in similar_companies_negative:
        similar_companies_negative[company] = set()
    similar_companies_negative[company].add(component_number)

Define a function to find companies with the most similar correlations¶

def find_most_similar_companies(similar_companies_dict, threshold): return [company for company, components in similar_companies_dict.items() if len(components) >= threshold]

Set a threshold for similarity (e.g., the number of components that should have the same correlation sign)¶

threshold = n_components # All components must have the same correlation sign

Find companies with the most similar positive and negative correlations¶

most_similar_positive_companies = find_most_similar_companies(similar_companies_positive, threshold) most_similar_negative_companies = find_most_similar_companies(similar_companies_negative, threshold)

Print the results¶

print("Companies with the most similar positive correlations:") print(most_similar_positive_companies)

print("\nCompanies with the most similar negative correlations:") print(most_similar_negative_companies)

In this code: We create dictionaries (similar_companies_positive and similar_companies_negative) to track which companies share the same correlation sign for each component. For each component, we update these dictionaries to include the companies that have positive and negative correlations. We define a function, find_most_similar_companies, which allows us to find companies with the most similar correlations based on a threshold. In this case, the threshold is set to the total number of components (n_components), meaning that companies must have the same correlation sign for all components to be considered the most similar. We find companies with the most similar positive and negative correlations based on the threshold and print the results. This code will give you a list of companies that behave in the most similar way by having the same correlation signs across all components. You can adjust the threshold to control the level of similarity required.

please dont seperate between positive and negative in the similarity part. I want to see the list of pairs of companies with the most common same sign numbers, list the number too ChatGPT Certainly, if you want to find pairs of companies that have the most common same-sign numbers (either positive or negative) across all components and list the number of times they share the same sign, you can modify the code as follows: python Copy code import pandas as pd

Assuming you have already performed PCA and obtained the 'loadings' DataFrame¶

'n_components' is the total number of components¶

Create a dictionary to store the counts of same-sign numbers for pairs of companies¶

similar_sign_counts = {}

for component_number in range(n_components):

# Get the loadings for the current component
component_loadings = loadings.iloc[component_number, :]

# Identify companies with positive and negative correlations
positive_correlations = component_loadings[component_loadings > 0].index
negative_correlations = component_loadings[component_loadings < 0].index

# Update the dictionary with positive correlations
for i, company1 in enumerate(positive_correlations):
    for j, company2 in enumerate(positive_correlations):
        if i != j:
            pair = (company1, company2)
            if pair not in similar_sign_counts:
                similar_sign_counts[pair] = 0
            similar_sign_counts[pair] += 1

# Update the dictionary with negative correlations
for i, company1 in enumerate(negative_correlations):
    for j, company2 in enumerate(negative_correlations):
        if i != j:
            pair = (company1, company2)
            if pair not in similar_sign_counts:
                similar_sign_counts[pair] = 0
            similar_sign_counts[pair] += 1

Find the pairs of companies with the most common same-sign numbers¶

max_count = max(similar_sign_counts.values()) most_common_pairs = [pair for pair, count in similar_sign_counts.items() if count == max_count]

Print the most common pairs and their common same-sign count¶

print("Pairs of companies with the most common same-sign numbers:") for pair in most_common_pairs: count = similar_sign_counts[pair] print(f"Pair: {pair}, Common Same-Sign Count: {count}")

In this code: We create a dictionary similar_sign_counts to track the count of same-sign numbers for pairs of companies. We iterate through each component and, for positive and negative correlations, update the dictionary with pairs of companies that share the same sign. If a pair doesn't exist in the dictionary, we initialize it with a count of 0 and then increment the count. After processing all components, we find the pairs of companies with the most common same-sign numbers by identifying pairs with the maximum count. Finally, we print the pairs of companies and their common same-sign counts. This code provides you with pairs of companies that have the most common same-sign numbers across all components, along with the count of times they share the same sign.

can you list the first 10 pairs ChatGPT Certainly, to list the first 10 pairs of companies with the most common same-sign numbers across all components, you can modify the code to display a limited number of pairs. Here's how you can do it: python Copy code import pandas as pd

Assuming you have already performed PCA and obtained the 'loadings' DataFrame¶

'n_components' is the total number of components¶

Create a dictionary to store the counts of same-sign numbers for pairs of companies¶

similar_sign_counts = {}

for component_number in range(n_components):

# Get the loadings for the current component
component_loadings = loadings.iloc[component_number, :]

# Identify companies with positive and negative correlations
positive_correlations = component_loadings[component_loadings > 0].index
negative_correlations = component_loadings[component_loadings < 0].index

# Update the dictionary with positive correlations
for i, company1 in enumerate(positive_correlations):
    for j, company2 in enumerate(positive_correlations):
        if i != j:
            pair = (company1, company2)
            if pair not in similar_sign_counts:
                similar_sign_counts[pair] = 0
            similar_sign_counts[pair] += 1

# Update the dictionary with negative correlations
for i, company1 in enumerate(negative_correlations):
    for j, company2 in enumerate(negative_correlations):
        if i != j:
            pair = (company1, company2)
            if pair not in similar_sign_counts:
                similar_sign_counts[pair] = 0
            similar_sign_counts[pair] += 1

Find the pairs of companies with the most common same-sign numbers¶

sorted_pairs = sorted(similar_sign_counts.items(), key=lambda x: x[1], reverse=True)

Print the first 10 pairs and their common same-sign count¶

print("First 10 pairs of companies with the most common same-sign numbers:") for pair, count in sorted_pairs[:10]: print(f"Pair: {pair}, Common Same-Sign Count: {count}")

This modified code will display the first 10 pairs of companies with the most common same-sign numbers across all components, along with the count of times they share the same sign.

i want to see the first 30 pairs but only print the odd numbers ChatGPT If you want to display the first 30 pairs of companies with the most common same-sign numbers but only print the odd-numbered pairs, you can modify the code as follows: python Copy code import pandas as pd

Assuming you have already performed PCA and obtained the 'loadings' DataFrame¶

'n_components' is the total number of components¶

Create a dictionary to store the counts of same-sign numbers for pairs of companies¶

similar_sign_counts = {}

for component_number in range(n_components):

# Get the loadings for the current component
component_loadings = loadings.iloc[component_number, :]

# Identify companies with positive and negative correlations
positive_correlations = component_loadings[component_loadings > 0].index
negative_correlations = component_loadings[component_loadings < 0].index

# Update the dictionary with positive correlations
for i, company1 in enumerate(positive_correlations):
    for j, company2 in enumerate(positive_correlations):
        if i != j:
            pair = (company1, company2)
            if pair not in similar_sign_counts:
                similar_sign_counts[pair] = 0
            similar_sign_counts[pair] += 1

# Update the dictionary with negative correlations
for i, company1 in enumerate(negative_correlations):
    for j, company2 in enumerate(negative_correlations):
        if i != j:
            pair = (company1, company2)
            if pair not in similar_sign_counts:
                similar_sign_counts[pair] = 0
            similar_sign_counts[pair] += 1

Find the pairs of companies with the most common same-sign numbers¶

sorted_pairs = sorted(similar_sign_counts.items(), key=lambda x: x[1], reverse=True)

Print the first 30 odd-numbered pairs and their common same-sign count¶

print("First 30 odd-numbered pairs of companies with the most common same-sign numbers:") for i, (pair, count) in enumerate(sorted_pairs[:60], start=1): if i % 2 != 0: print(f"Pair: {pair}, Common Same-Sign Count: {count}")

This modified code will display the first 30 odd-numbered pairs of companies with the most common same-sign numbers across all components, along with the count of times they share the same sign.

i want to reduce the dimensions, i.e number of components? how can i pick the ideal number of components? ChatGPT To reduce the dimensionality in a PCA analysis and choose the ideal number of components, you can use the concept of explained variance. The goal is to retain a sufficient amount of variance while reducing the number of components. Here are the steps to pick the ideal number of components: Calculate Explained Variance: After performing PCA, you have a list of explained variance for each component. This list tells you how much variance in the original data each component captures. Plot Explained Variance: Create a scree plot or cumulative explained variance plot. In the scree plot, you'll see the explained variance for each component. In the cumulative explained variance plot, you'll see the cumulative variance explained as you include more components. Select a Threshold: Determine the threshold for the amount of variance you want to retain. For example, you might decide to retain 95% of the total variance. Choose the Number of Components: Count the number of components required to exceed the selected threshold in the cumulative explained variance plot. This is your ideal number of components. Here's an example of how to do this in Python using the scikit-learn library: python Copy code import pandas as pd from sklearn.decomposition import PCA import matplotlib.pyplot as plt

Assuming you have your data in a DataFrame 'data'¶

Perform PCA¶

pca = PCA() pca.fit(data)

Plot explained variance¶

explained_variance = pca.explained_varianceratio cumulative_explained_variance = explained_variance.cumsum()

plt.figure(figsize=(10, 5)) plt.bar(range(1, len(explained_variance) + 1), explained_variance, label="Explained Variance") plt.step(range(1, len(cumulative_explained_variance) + 1), cumulative_explained_variance, where='mid', label="Cumulative Explained Variance") plt.xlabel('Number of Components') plt.ylabel('Variance Explained') plt.legend() plt.grid() plt.show()

Determine the ideal number of components (e.g., retaining 95% of variance)¶

threshold = 0.95 n_components = sum(cumulative_explained_variance <= threshold) + 1 print(f"Ideal number of components to retain {threshold}% of variance: {n_components}")

In the code above, you'll perform PCA, plot the explained variance and cumulative explained variance, and choose the number of components that allows you to retain the desired amount of variance (in this case, 95%). Adjust the threshold to meet your specific requirements for dimensionality reduction.

i already applied PCA, i had 60 components, first 13 covers the 95% of the variance. what should i do with this result? should i redo the pca again with selecting the number of components as 13? ChatGPT If you have already applied PCA and found that the first 13 components cover 95% of the variance in your data, you have a choice to make based on your goals and requirements: Retain the First 13 Components: If the first 13 components capture 95% of the variance and provide meaningful information for your analysis, you can choose to retain only these 13 components. Redoing PCA with fewer components can help reduce the dimensionality of your data, making it computationally more efficient and easier to interpret. Reapply PCA with Fewer Components: If you want to apply PCA with only 13 components, you can reapply PCA with the specific number of components (13) rather than the original 60. This can be beneficial if you prefer working with a reduced dataset or need to perform further analyses with the reduced dimensionality. The decision between these two options depends on your specific use case and whether retaining the first 13 components provides adequate information for your analysis. Reducing the number of components can simplify your data, but it's important to ensure that the reduced dataset still captures the essential patterns and relationships in your data. If you choose to retain the first 13 components, you can simply use those components in your subsequent analysis without reapplying PCA.

"# Select all columns except the first column (timestamps) columns_to_analyze = allticks_data.columns[1:] # Calculate the variance and standard deviation for each selected column variances = allticks_data[columns_to_analyze].var() std_deviations = allticks_data[columns_to_analyze].std() # Create a new DataFrame to store the results statistics_df = pd.DataFrame({ 'Column': columns_to_analyze, 'Variance': variances, 'Standard Deviation': std_deviations }) # Display the calculated statistics print(statistics_df) statistics_df.to_excel('stats.xlsx',index=False)" can i also add coefficient of variance to this code ChatGPT Certainly, you can calculate the coefficient of variation (CV) and add it to your statistics DataFrame. The coefficient of variation is a measure of relative variability and is calculated as the ratio of the standard deviation to the mean. Here's how you can modify your code to include the coefficient of variation: python Copy code import pandas as pd

Select all columns except the first column (timestamps)¶

columns_to_analyze = allticks_data.columns[1:]

Calculate the variance, standard deviation, and coefficient of variation for each selected column¶

variances = allticks_data[columns_to_analyze].var() std_deviations = allticks_data[columns_to_analyze].std() means = allticks_data[columns_to_analyze].mean() coeff_of_variation = (std_deviations / means) * 100 # Multiply by 100 to express as a percentage

Create a new DataFrame to store the results¶

statistics_df = pd.DataFrame({ 'Column': columns_to_analyze, 'Variance': variances, 'Standard Deviation': std_deviations, 'Coefficient of Variation': coeff_of_variation })

Display the calculated statistics¶

print(statistics_df)

Save the statistics to an Excel file¶

statistics_df.to_excel('stats.xlsx', index=False)

In this modified code, we calculate the coefficient of variation by dividing the standard deviation by the mean and then multiplying by 100 to express it as a percentage. The resulting statistics DataFrame now includes the coefficient of variation along with variance and standard deviation. Additionally, it saves the statistics to an Excel file as you originally requested.

In [ ]: